Diffstat (limited to 'src/core/NEON/kernels/arm_conv')
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp | 4
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp | 29
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp | 248
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp | 248
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp | 118
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp | 118
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 284
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 320
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 776
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 754
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1228
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1238
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 492
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 644
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 552
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 716
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp | 142
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 420
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 276
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 320
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 716
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 670
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1176
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1106
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 484
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 612
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 544
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 684
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp | 142
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp | 214
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp | 244
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 312
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 2830
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1720
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 2146
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 3204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 168
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 498
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 608
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 416
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 2348
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 2830
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1720
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 2146
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 3204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 168
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 498
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 608
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 416
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1824
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 2168
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 3586
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1720
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 2146
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 3208
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 168
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 416
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 276
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 296
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 562
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 614
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 968
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1000
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 346
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 286
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 8
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 726
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 784
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 360
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 296
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 624
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 584
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 438
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 760
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 424
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 360
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp | 388
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp | 536
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp | 942
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp | 1254
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp | 554
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp | 790
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp | 1479
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp | 1531
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp | 767
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp | 1065
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp | 1641
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp | 1811
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp | 767
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp | 1065
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp | 1641
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp | 1811
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp | 767
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp | 1065
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp | 1641
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp | 1811
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 278
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 686
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 656
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1070
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1024
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 426
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 456
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 616
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 862
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 278
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 204
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 686
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 656
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1070
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1024
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 426
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 456
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 616
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 862
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp | 120
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp | 200
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp | 380
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 502
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 840
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 668
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 1000
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 410
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 516
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 666
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 840
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 668
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 1000
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 410
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 516
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 590
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 668
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 1000
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp | 19
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 226
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp | 344
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 88
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp | 330
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 226
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp | 296
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 88
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp | 282
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp | 176
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 88
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp | 426
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp | 286
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp | 628
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp | 230
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 88
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp | 426
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp | 344
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp | 826
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 158
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp | 98
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp | 108
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 158
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp | 98
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp | 108
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp | 142
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp | 108
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp | 160
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp | 360
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp | 190
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 52
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp | 108
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp | 204
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp | 436
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 174
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp | 44
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp | 218
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp | 174
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp | 44
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp | 218
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp | 290
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp | 218
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp | 332
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp | 528
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp | 284
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp | 84
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp | 218
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp | 352
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp | 586
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp | 11
-rw-r--r--  src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp | 6
197 files changed, 63314 insertions(+), 63175 deletions(-)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
index 15064aeedc..52ecaff0a8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -133,7 +133,7 @@ bool has_channel_multiplier(const DepthwiseArgs &args, const void *)
bool no_prime_right_pad(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
bool no_prime_right_pad(const DepthwiseArgs &args, const void *)
{
- return (args.input_cols + args.padding.left) >= (args.kernel_cols - 1);
+ return ((args.input_cols + args.padding.left) / args.dilation_cols) >= (args.kernel_cols - 1);
}
bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
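Note (editor's illustration, not part of the patch): the hunk above makes the no_prime_right_pad constraint dilation-aware by scaling the available columns down before comparing against kernel_cols - 1. The sketch below restates the check as a standalone C++ program; the Args struct here is a hypothetical stand-in for DepthwiseArgs carrying only the fields the predicate reads, and the example values are chosen so that the old and new checks disagree.

// Illustrative sketch only: dilation-aware "no prime right pad" check.
#include <cstdio>

struct Args
{
  unsigned int input_cols;
  unsigned int kernel_cols;
  unsigned int dilation_cols;
  struct { unsigned int left; } padding;
};

static bool no_prime_right_pad(const Args &args)
{
  // Dividing by the column dilation shrinks the effective number of input
  // positions a dilated kernel can step across before right padding is needed.
  return ((args.input_cols + args.padding.left) / args.dilation_cols) >= (args.kernel_cols - 1);
}

int main()
{
  Args dense{2, 3, 1, {1}};    // no dilation: (2 + 1) / 1 = 3 >= 2 -> constraint holds
  Args dilated{2, 3, 2, {1}};  // dilation 2:  (2 + 1) / 2 = 1 <  2 -> constraint now fails
  std::printf("dense=%d dilated=%d\n", no_prime_right_pad(dense), no_prime_right_pad(dilated));
  return 0;
}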
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
index c3daaf04fe..adcbedf4ce 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -210,13 +210,30 @@ struct OutputRowPtrsElement
T *output_padding_buffer;
};
+ // On some implementations there is a significant performance benefit to
+ // aligning the padding buffer to a 1024 byte boundary. This routine
+ // adds as much padding as needed to an arbitrary input pointer and
+ // returns an aligned void *.
+ static constexpr intptr_t BUFFER_ALIGNMENT=1024;
+
+ template <typename ptr_T>
+ static void *do_align(ptr_T in)
+ {
+ intptr_t v = reinterpret_cast<intptr_t>(in);
+ intptr_t odds = v & (BUFFER_ALIGNMENT-1);
+ intptr_t pad = odds ? (BUFFER_ALIGNMENT - odds) : 0;
+
+ return reinterpret_cast<void *>(v + pad);
+ }
+
template <typename OutputStage>
static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
{
- // We need one pointer and stride for each row of output, and an additional
- // blob of memory into which padded stores can go.
+ // We need one pointer and stride for each row of output, and an
+ // additional blob of memory into which padded stores can go. Allow
+ // extra space so that this padding buffer can be aligned at both ends.
return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
- get_vector_length<char>(args.strategy->get_vl_type());
+ get_vector_length<char>(args.strategy->get_vl_type()) + BUFFER_ALIGNMENT*2;
}
template <typename WorkspaceType, typename OutputStage>
@@ -227,8 +244,8 @@ struct OutputRowPtrsElement
ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
ws->output_ld_vls = ws->output_ld_cols + n_rows;
- ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
- return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
+ ws->output_padding_buffer = reinterpret_cast<T *>(do_align(ws->output_ld_vls + n_rows));
+ return do_align(ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type()));
}
};
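Note (editor's illustration, not part of the patch): the hunk above rounds the output padding buffer up to a 1024-byte boundary and reserves BUFFER_ALIGNMENT*2 extra bytes in the workspace so that both ends of the buffer can be aligned. The sketch below is a minimal standalone copy of that rounding, under the assumption that a raw char array stands in for the real workspace; the names workspace and aligned are illustrative only.

// Illustrative sketch only: round a pointer up to the next 1024-byte boundary.
#include <cstdint>
#include <cstdio>

static constexpr std::intptr_t BUFFER_ALIGNMENT = 1024;

template <typename ptr_T>
static void *do_align(ptr_T in)
{
  const auto v    = reinterpret_cast<std::intptr_t>(in);
  const auto odds = v & (BUFFER_ALIGNMENT - 1);            // bytes past the previous boundary
  const auto pad  = odds ? (BUFFER_ALIGNMENT - odds) : 0;  // 0..1023 bytes of padding
  return reinterpret_cast<void *>(v + pad);
}

int main()
{
  // Worst case the raw pointer sits one byte past a boundary and costs 1023
  // bytes of pad; aligning the far end costs at most another 1023 bytes,
  // which is why the workspace size gains 2 * BUFFER_ALIGNMENT of slack.
  char workspace[3 * BUFFER_ALIGNMENT];
  void *aligned = do_align(workspace + 1);
  std::printf("offset into workspace: %td, 1024-aligned: %d\n",
              static_cast<char *>(aligned) - workspace,
              (reinterpret_cast<std::intptr_t>(aligned) & (BUFFER_ALIGNMENT - 1)) == 0);
  return 0;
}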
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
index 3de4bdc1fb..f18208d6c4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,189 +52,189 @@ void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
{
__asm__ __volatile__(
"cmp %x[ld_weight_col], XZR\n"
- "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov x22, #0x3\n"
"movi v16.4s, #0x9\n"
- "movi v31.16b, #0x0\n"
- "mov x21, #0x3\n"
- "mul x21, %x[ld_weight_col], x21\n"
+ "movi v0.16b, #0x0\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
"add x20, %x[qp], %[offsetof_input_offset]\n"
+ "movi v31.16b, #0x1\n"
"ld1r { v30.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_weights_offset]\n"
- "ld1r { v29.4s }, [x20]\n"
"cmp %x[ld_weight_row], XZR\n"
- "mul v29.4s, v29.4s, v30.4s\n"
- "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "ld1r { v29.4s }, [x20]\n"
"lsr x21, %x[n_channels], #0x2\n"
- "movi v28.16b, #0x1\n"
- "mul v29.4s, v29.4s, v16.4s\n"
- "add x25, %x[weights], %x[ld_weight_row]\n"
"add x20, %x[qp], %[offsetof_per_layer_mul]\n"
- "ld1r { v27.4s }, [x20]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "mul x22, %x[ld_weight_col], x22\n"
"add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
- "ld1r { v26.4s }, [x20]\n"
- "add x24, x25, %x[ld_weight_row]\n"
- "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
- "mov x22, #0x0\n"
+ "add x25, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "mov x24, #0x0\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x22, NE\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x23, %x[weights], %x[ld_weight_row]\n"
+ "add x22, x23, %x[ld_weight_row]\n"
"cbz x21, 4f\n"
"1:" // Loop
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q25, [%x[bias], x22]\n"
+ "ldr q26, [%x[bias], x24]\n"
"2:" // Loop: Skip bias load
- "ldr s19, [%x[weights], #0x0]\n"
- "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
- "zip1 v17.16b, v16.16b, v31.16b\n"
- "movi v21.4s, #0x0\n"
- "ldr s16, [%x[weights], x23]\n"
- "ldr s18, [x25, #0x0]\n"
- "zip1 v16.16b, v19.16b, v16.16b\n"
- "zip1 v20.16b, v16.16b, v17.16b\n"
- "ldr s17, [x25, %x[ld_weight_col]]\n"
- "ldr s16, [x25, x23]\n"
- "zip1 v18.16b, v18.16b, v16.16b\n"
- "zip1 v16.16b, v17.16b, v31.16b\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s19, [x24, %x[ld_weight_col]]\n"
- ".inst 0x4e949795 // sdot v21.4s, v28.16b, v20.16b\n"
- "zip1 v18.16b, v18.16b, v16.16b\n"
- "ldr s16, [x24, x23]\n"
- "zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v19.16b, v31.16b\n"
- ".inst 0x4e929795 // sdot v21.4s, v28.16b, v18.16b\n"
- "zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x4e909795 // sdot v21.4s, v28.16b, v16.16b\n"
+ "ldr s25, [%x[weights], #0x0]\n"
+ "ldr s18, [%x[weights], %x[ld_weight_col]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr s16, [%x[weights], x25]\n"
+ "ldr s20, [x23, #0x0]\n"
"add %x[weights], %x[weights], #0x4\n"
- "add x25, x25, #0x4\n"
- "mls v25.4s, v21.4s, v30.4s\n"
- "add x24, x24, #0x4\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "str q25, [%x[outptr], #0x0]\n"
- "str q20, [%x[outptr], #0x10]\n"
- "str q18, [%x[outptr], #0x20]\n"
+ "ldr s23, [x23, %x[ld_weight_col]]\n"
+ "ldr s17, [x23, x25]\n"
+ "add x23, x23, #0x4\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s19, [x22, %x[ld_weight_col]]\n"
+ "zip1 v18.16b, v18.16b, v0.16b\n"
+ "ldr s21, [x22, x25]\n"
+ "zip1 v16.16b, v25.16b, v16.16b\n"
+ "add x22, x22, #0x4\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v17.16b, v23.16b, v0.16b\n"
+ "zip1 v19.16b, v19.16b, v0.16b\n"
+ "zip1 v18.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v21.16b\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ ".inst 0x4e9297f8 // sdot v24.4s, v31.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v19.16b\n"
+ ".inst 0x4e9197f8 // sdot v24.4s, v31.16b, v17.16b\n"
+ ".inst 0x4e9097f8 // sdot v24.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v24.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q18, [%x[outptr], #0x10]\n"
+ "str q17, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ldr q27, [%x[rq_mul_perchannel], x22]\n"
- "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "ldr q28, [%x[rq_mul_perchannel], x24]\n"
+ "ldr q27, [%x[rq_shift_perchannel], x24]\n"
"3:" // Loop: Quantisation parameters: Store
"subs x21, x21, #0x1\n"
- "str q27, [%x[outptr], #0x0]\n"
- "add x22, x22, #0x10\n"
- "str q26, [%x[outptr], #0x10]\n"
+ "str q28, [%x[outptr], #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "str q27, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"bgt 1b\n"
"tst %x[n_channels], #0x3\n"
"beq 13f\n"
"4:" // Oddments
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"cbz %x[bias], 7f\n"
- "add %x[bias], %x[bias], x22\n"
+ "add %x[bias], %x[bias], x24\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
"b 6f\n"
"5:" // Oddments: Load bias: Bit 1: Unset
- "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
"6:" // Oddments: Load bias: Bit 1: End
"7:" // Oddments: Skip bias load
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v17.h }[0], [%x[weights]]\n"
- "ld1 { v24.h }[0], [x25]\n"
+ "ld1 { v18.h }[0], [%x[weights]]\n"
+ "ld1 { v21.h }[0], [x23]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
- "add x20, %x[weights], x23\n"
- "ld1 { v20.h }[0], [x21]\n"
+ "add x20, %x[weights], x25\n"
+ "ld1 { v24.h }[0], [x21]\n"
"ld1 { v16.h }[0], [x20]\n"
- "add x21, x25, %x[ld_weight_col]\n"
- "add x20, x25, x23\n"
- "ld1 { v19.h }[0], [x21]\n"
- "ld1 { v18.h }[0], [x20]\n"
- "add x21, x24, %x[ld_weight_col]\n"
- "add x20, x24, x23\n"
- "ld1 { v23.h }[0], [x24]\n"
+ "add x21, x23, %x[ld_weight_col]\n"
+ "add x20, x23, x25\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v17.h }[0], [x20]\n"
+ "add x21, x22, %x[ld_weight_col]\n"
+ "add x20, x22, x25\n"
+ "ld1 { v23.h }[0], [x22]\n"
"ld1 { v22.h }[0], [x21]\n"
"add %x[weights], %x[weights], #0x2\n"
- "add x25, x25, #0x2\n"
- "ld1 { v21.h }[0], [x20]\n"
- "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "ld1 { v19.h }[0], [x20]\n"
+ "add x22, x22, #0x2\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v17.b }[2], [%x[weights]]\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "ld1 { v18.b }[2], [%x[weights]]\n"
+ "ld1 { v21.b }[2], [x23]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
- "add x20, %x[weights], x23\n"
- "ld1 { v20.b }[2], [x21]\n"
+ "add x20, %x[weights], x25\n"
+ "ld1 { v24.b }[2], [x21]\n"
"ld1 { v16.b }[2], [x20]\n"
- "add x21, x25, %x[ld_weight_col]\n"
- "add x20, x25, x23\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v18.b }[2], [x20]\n"
- "add x21, x24, %x[ld_weight_col]\n"
- "add x20, x24, x23\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "add x21, x23, %x[ld_weight_col]\n"
+ "add x20, x23, x25\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "add x21, x22, %x[ld_weight_col]\n"
+ "add x20, x22, x25\n"
+ "ld1 { v23.b }[2], [x22]\n"
"ld1 { v22.b }[2], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 9f\n"
"8:" // Oddments: Load weights: Bit 1: Unset
- "ld1 { v17.b }[0], [%x[weights]]\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "ld1 { v18.b }[0], [%x[weights]]\n"
+ "ld1 { v21.b }[0], [x23]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
- "add x20, %x[weights], x23\n"
- "ld1 { v20.b }[0], [x21]\n"
+ "add x20, %x[weights], x25\n"
+ "ld1 { v24.b }[0], [x21]\n"
"ld1 { v16.b }[0], [x20]\n"
- "add x21, x25, %x[ld_weight_col]\n"
- "add x20, x25, x23\n"
- "ld1 { v19.b }[0], [x21]\n"
- "ld1 { v18.b }[0], [x20]\n"
- "add x21, x24, %x[ld_weight_col]\n"
- "add x20, x24, x23\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "add x21, x23, %x[ld_weight_col]\n"
+ "add x20, x23, x25\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "add x21, x22, %x[ld_weight_col]\n"
+ "add x20, x22, x25\n"
+ "ld1 { v23.b }[0], [x22]\n"
"ld1 { v22.b }[0], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"9:" // Oddments: Load weights: Bit 1: End
- "zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v20.16b, v31.16b\n"
- "zip1 v20.16b, v17.16b, v16.16b\n"
- "zip1 v17.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v31.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4e949793 // sdot v19.4s, v28.16b, v20.16b\n"
- "zip1 v18.16b, v17.16b, v16.16b\n"
- "zip1 v17.16b, v23.16b, v21.16b\n"
- ".inst 0x4e929793 // sdot v19.4s, v28.16b, v18.16b\n"
- "zip1 v16.16b, v22.16b, v31.16b\n"
- "zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x4e909793 // sdot v19.4s, v28.16b, v16.16b\n"
- "mls v25.4s, v19.4s, v30.4s\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "str q25, [%x[outptr], #0x0]\n"
- "str q20, [%x[outptr], #0x10]\n"
- "str q18, [%x[outptr], #0x20]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v24.16b, v0.16b\n"
+ "zip1 v21.16b, v21.16b, v17.16b\n"
+ "zip1 v17.16b, v20.16b, v0.16b\n"
+ "movi v20.4s, #0x0\n"
+ "zip1 v19.16b, v23.16b, v19.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v22.16b, v0.16b\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ ".inst 0x4e9297f4 // sdot v20.4s, v31.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ ".inst 0x4e9197f4 // sdot v20.4s, v31.16b, v17.16b\n"
+ ".inst 0x4e9097f4 // sdot v20.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v20.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q18, [%x[outptr], #0x10]\n"
+ "str q17, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 12f\n"
- "add x21, %x[rq_mul_perchannel], x22\n"
- "add x20, %x[rq_shift_perchannel], x22\n"
+ "add x21, %x[rq_mul_perchannel], x24\n"
+ "add x20, %x[rq_shift_perchannel], x24\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v27.d }[0], [x21], #0x8\n"
- "ld1 { v26.d }[0], [x20], #0x8\n"
+ "ld1 { v28.d }[0], [x21], #0x8\n"
+ "ld1 { v27.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v27.s }[2], [x21], #0x4\n"
- "ld1 { v26.s }[2], [x20], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v27.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
"12:" // Oddments: Quantisation parameters: Store
- "str q27, [%x[outptr], #0x0]\n"
- "str q26, [%x[outptr], #0x10]\n"
+ "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"13:" // End
: [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
index 19264c9fce..0ebf6ac10f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,189 +52,189 @@ void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
{
__asm__ __volatile__(
"cmp %x[ld_weight_col], XZR\n"
- "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov x22, #0x3\n"
"movi v16.4s, #0x9\n"
- "movi v31.16b, #0x0\n"
- "mov x21, #0x3\n"
- "mul x21, %x[ld_weight_col], x21\n"
+ "movi v0.16b, #0x0\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
"add x20, %x[qp], %[offsetof_input_offset]\n"
+ "movi v31.16b, #0x1\n"
"ld1r { v30.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_weights_offset]\n"
- "ld1r { v29.4s }, [x20]\n"
"cmp %x[ld_weight_row], XZR\n"
- "mul v29.4s, v29.4s, v30.4s\n"
- "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "ld1r { v29.4s }, [x20]\n"
"lsr x21, %x[n_channels], #0x2\n"
- "movi v28.16b, #0x1\n"
- "mul v29.4s, v29.4s, v16.4s\n"
- "add x25, %x[weights], %x[ld_weight_row]\n"
"add x20, %x[qp], %[offsetof_per_layer_mul]\n"
- "ld1r { v27.4s }, [x20]\n"
+ "ld1r { v28.4s }, [x20]\n"
+ "mul x22, %x[ld_weight_col], x22\n"
"add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
- "ld1r { v26.4s }, [x20]\n"
- "add x24, x25, %x[ld_weight_row]\n"
- "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
- "mov x22, #0x0\n"
+ "add x25, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "mov x24, #0x0\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x22, NE\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x23, %x[weights], %x[ld_weight_row]\n"
+ "add x22, x23, %x[ld_weight_row]\n"
"cbz x21, 4f\n"
"1:" // Loop
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"cbz %x[bias], 2f\n"
- "ldr q25, [%x[bias], x22]\n"
+ "ldr q26, [%x[bias], x24]\n"
"2:" // Loop: Skip bias load
- "ldr s19, [%x[weights], #0x0]\n"
- "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
- "zip1 v17.16b, v16.16b, v31.16b\n"
- "movi v21.4s, #0x0\n"
- "ldr s16, [%x[weights], x23]\n"
- "ldr s18, [x25, #0x0]\n"
- "zip1 v16.16b, v19.16b, v16.16b\n"
- "zip1 v20.16b, v16.16b, v17.16b\n"
- "ldr s17, [x25, %x[ld_weight_col]]\n"
- "ldr s16, [x25, x23]\n"
- "zip1 v18.16b, v18.16b, v16.16b\n"
- "zip1 v16.16b, v17.16b, v31.16b\n"
- "ldr s17, [x24, #0x0]\n"
- "ldr s19, [x24, %x[ld_weight_col]]\n"
- ".inst 0x6e949795 // udot v21.4s, v28.16b, v20.16b\n"
- "zip1 v18.16b, v18.16b, v16.16b\n"
- "ldr s16, [x24, x23]\n"
- "zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v19.16b, v31.16b\n"
- ".inst 0x6e929795 // udot v21.4s, v28.16b, v18.16b\n"
- "zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x6e909795 // udot v21.4s, v28.16b, v16.16b\n"
+ "ldr s25, [%x[weights], #0x0]\n"
+ "ldr s18, [%x[weights], %x[ld_weight_col]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr s16, [%x[weights], x25]\n"
+ "ldr s20, [x23, #0x0]\n"
"add %x[weights], %x[weights], #0x4\n"
- "add x25, x25, #0x4\n"
- "mls v25.4s, v21.4s, v30.4s\n"
- "add x24, x24, #0x4\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "str q25, [%x[outptr], #0x0]\n"
- "str q20, [%x[outptr], #0x10]\n"
- "str q18, [%x[outptr], #0x20]\n"
+ "ldr s23, [x23, %x[ld_weight_col]]\n"
+ "ldr s17, [x23, x25]\n"
+ "add x23, x23, #0x4\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s19, [x22, %x[ld_weight_col]]\n"
+ "zip1 v18.16b, v18.16b, v0.16b\n"
+ "ldr s21, [x22, x25]\n"
+ "zip1 v16.16b, v25.16b, v16.16b\n"
+ "add x22, x22, #0x4\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "zip1 v17.16b, v23.16b, v0.16b\n"
+ "zip1 v19.16b, v19.16b, v0.16b\n"
+ "zip1 v18.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v21.16b\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ ".inst 0x6e9297f8 // udot v24.4s, v31.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v19.16b\n"
+ ".inst 0x6e9197f8 // udot v24.4s, v31.16b, v17.16b\n"
+ ".inst 0x6e9097f8 // udot v24.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v24.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q18, [%x[outptr], #0x10]\n"
+ "str q17, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ldr q27, [%x[rq_mul_perchannel], x22]\n"
- "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "ldr q28, [%x[rq_mul_perchannel], x24]\n"
+ "ldr q27, [%x[rq_shift_perchannel], x24]\n"
"3:" // Loop: Quantisation parameters: Store
"subs x21, x21, #0x1\n"
- "str q27, [%x[outptr], #0x0]\n"
- "add x22, x22, #0x10\n"
- "str q26, [%x[outptr], #0x10]\n"
+ "str q28, [%x[outptr], #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "str q27, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"bgt 1b\n"
"tst %x[n_channels], #0x3\n"
"beq 13f\n"
"4:" // Oddments
- "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
"cbz %x[bias], 7f\n"
- "add %x[bias], %x[bias], x22\n"
+ "add %x[bias], %x[bias], x24\n"
"tbz %x[n_channels], #1, 5f\n"
- "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
"tbz %x[n_channels], #0, 6f\n"
- "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
"b 6f\n"
"5:" // Oddments: Load bias: Bit 1: Unset
- "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
"6:" // Oddments: Load bias: Bit 1: End
"7:" // Oddments: Skip bias load
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v17.h }[0], [%x[weights]]\n"
- "ld1 { v24.h }[0], [x25]\n"
+ "ld1 { v18.h }[0], [%x[weights]]\n"
+ "ld1 { v21.h }[0], [x23]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
- "add x20, %x[weights], x23\n"
- "ld1 { v20.h }[0], [x21]\n"
+ "add x20, %x[weights], x25\n"
+ "ld1 { v24.h }[0], [x21]\n"
"ld1 { v16.h }[0], [x20]\n"
- "add x21, x25, %x[ld_weight_col]\n"
- "add x20, x25, x23\n"
- "ld1 { v19.h }[0], [x21]\n"
- "ld1 { v18.h }[0], [x20]\n"
- "add x21, x24, %x[ld_weight_col]\n"
- "add x20, x24, x23\n"
- "ld1 { v23.h }[0], [x24]\n"
+ "add x21, x23, %x[ld_weight_col]\n"
+ "add x20, x23, x25\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v17.h }[0], [x20]\n"
+ "add x21, x22, %x[ld_weight_col]\n"
+ "add x20, x22, x25\n"
+ "ld1 { v23.h }[0], [x22]\n"
"ld1 { v22.h }[0], [x21]\n"
"add %x[weights], %x[weights], #0x2\n"
- "add x25, x25, #0x2\n"
- "ld1 { v21.h }[0], [x20]\n"
- "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "ld1 { v19.h }[0], [x20]\n"
+ "add x22, x22, #0x2\n"
"tbz %x[n_channels], #0, 9f\n"
- "ld1 { v17.b }[2], [%x[weights]]\n"
- "ld1 { v24.b }[2], [x25]\n"
+ "ld1 { v18.b }[2], [%x[weights]]\n"
+ "ld1 { v21.b }[2], [x23]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
- "add x20, %x[weights], x23\n"
- "ld1 { v20.b }[2], [x21]\n"
+ "add x20, %x[weights], x25\n"
+ "ld1 { v24.b }[2], [x21]\n"
"ld1 { v16.b }[2], [x20]\n"
- "add x21, x25, %x[ld_weight_col]\n"
- "add x20, x25, x23\n"
- "ld1 { v19.b }[2], [x21]\n"
- "ld1 { v18.b }[2], [x20]\n"
- "add x21, x24, %x[ld_weight_col]\n"
- "add x20, x24, x23\n"
- "ld1 { v23.b }[2], [x24]\n"
+ "add x21, x23, %x[ld_weight_col]\n"
+ "add x20, x23, x25\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "add x21, x22, %x[ld_weight_col]\n"
+ "add x20, x22, x25\n"
+ "ld1 { v23.b }[2], [x22]\n"
"ld1 { v22.b }[2], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 9f\n"
"8:" // Oddments: Load weights: Bit 1: Unset
- "ld1 { v17.b }[0], [%x[weights]]\n"
- "ld1 { v24.b }[0], [x25]\n"
+ "ld1 { v18.b }[0], [%x[weights]]\n"
+ "ld1 { v21.b }[0], [x23]\n"
"add x21, %x[weights], %x[ld_weight_col]\n"
- "add x20, %x[weights], x23\n"
- "ld1 { v20.b }[0], [x21]\n"
+ "add x20, %x[weights], x25\n"
+ "ld1 { v24.b }[0], [x21]\n"
"ld1 { v16.b }[0], [x20]\n"
- "add x21, x25, %x[ld_weight_col]\n"
- "add x20, x25, x23\n"
- "ld1 { v19.b }[0], [x21]\n"
- "ld1 { v18.b }[0], [x20]\n"
- "add x21, x24, %x[ld_weight_col]\n"
- "add x20, x24, x23\n"
- "ld1 { v23.b }[0], [x24]\n"
+ "add x21, x23, %x[ld_weight_col]\n"
+ "add x20, x23, x25\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "add x21, x22, %x[ld_weight_col]\n"
+ "add x20, x22, x25\n"
+ "ld1 { v23.b }[0], [x22]\n"
"ld1 { v22.b }[0], [x21]\n"
"add %x[weights], %x[weights], #0x1\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"9:" // Oddments: Load weights: Bit 1: End
- "zip1 v17.16b, v17.16b, v16.16b\n"
- "zip1 v16.16b, v20.16b, v31.16b\n"
- "zip1 v20.16b, v17.16b, v16.16b\n"
- "zip1 v17.16b, v24.16b, v18.16b\n"
- "zip1 v16.16b, v19.16b, v31.16b\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6e949793 // udot v19.4s, v28.16b, v20.16b\n"
- "zip1 v18.16b, v17.16b, v16.16b\n"
- "zip1 v17.16b, v23.16b, v21.16b\n"
- ".inst 0x6e929793 // udot v19.4s, v28.16b, v18.16b\n"
- "zip1 v16.16b, v22.16b, v31.16b\n"
- "zip1 v16.16b, v17.16b, v16.16b\n"
- ".inst 0x6e909793 // udot v19.4s, v28.16b, v16.16b\n"
- "mls v25.4s, v19.4s, v30.4s\n"
- "add v25.4s, v25.4s, v29.4s\n"
- "str q25, [%x[outptr], #0x0]\n"
- "str q20, [%x[outptr], #0x10]\n"
- "str q18, [%x[outptr], #0x20]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v24.16b, v0.16b\n"
+ "zip1 v21.16b, v21.16b, v17.16b\n"
+ "zip1 v17.16b, v20.16b, v0.16b\n"
+ "movi v20.4s, #0x0\n"
+ "zip1 v19.16b, v23.16b, v19.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v22.16b, v0.16b\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ ".inst 0x6e9297f4 // udot v20.4s, v31.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ ".inst 0x6e9197f4 // udot v20.4s, v31.16b, v17.16b\n"
+ ".inst 0x6e9097f4 // udot v20.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v20.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q18, [%x[outptr], #0x10]\n"
+ "str q17, [%x[outptr], #0x20]\n"
"str q16, [%x[outptr], #0x30]\n"
"add %x[outptr], %x[outptr], #0x40\n"
"cbz %x[rq_mul_perchannel], 12f\n"
- "add x21, %x[rq_mul_perchannel], x22\n"
- "add x20, %x[rq_shift_perchannel], x22\n"
+ "add x21, %x[rq_mul_perchannel], x24\n"
+ "add x20, %x[rq_shift_perchannel], x24\n"
"tbz %x[n_channels], #1, 10f\n"
- "ld1 { v27.d }[0], [x21], #0x8\n"
- "ld1 { v26.d }[0], [x20], #0x8\n"
+ "ld1 { v28.d }[0], [x21], #0x8\n"
+ "ld1 { v27.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v27.s }[2], [x21], #0x4\n"
- "ld1 { v26.s }[2], [x20], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v27.s }[2], [x20], #0x4\n"
"b 11f\n"
"10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
"11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
"12:" // Oddments: Quantisation parameters: Store
- "str q27, [%x[outptr], #0x0]\n"
- "str q26, [%x[outptr], #0x10]\n"
+ "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x10]\n"
"add %x[outptr], %x[outptr], #0x20\n"
"13:" // End
: [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
index 5d7b54f235..7364963477 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,80 +52,80 @@ void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
{
__asm__ __volatile__(
"cmp %x[ld_weight_col], XZR\n"
- "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
- "mov z16.s, #0x9\n"
- "mov z28.b, #0x0\n"
"mov x20, #0x3\n"
- "ptrue p2.b\n"
- "mul x20, %x[ld_weight_col], x20\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "mov z16.s, #0x9\n"
+ "mov z31.b, #0x0\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "ptrue p3.b\n"
+ "mov z30.b, #0x1\n"
"cmp %x[ld_weight_row], XZR\n"
- "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
- "mov z25.b, #0x1\n"
- "mul z26.s, p2/M, z26.s, z27.s\n"
- "add x24, %x[weights], %x[ld_weight_row]\n"
- "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
- "add x23, x24, %x[ld_weight_row]\n"
- "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
- "whilelt p1.s, XZR, %x[n_channels]\n"
- "mov x21, #0x0\n"
- "mul z26.s, p2/M, z26.s, z16.s\n"
+ "mov x24, #0x0\n"
"pfalse p8.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z29.s }, p3/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "ld1rw { z27.s }, p3/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mul z28.s, p3/M, z28.s, z29.s\n"
+ "add x22, %x[weights], %x[ld_weight_row]\n"
+ "add x21, x22, %x[ld_weight_row]\n"
+ "mul z28.s, p3/M, z28.s, z16.s\n"
"cbz %x[bias], 1f\n"
"ptrue p8.s\n"
"1:" // No bias
"2:" // Loop
- "cntp x20, p2, p1.s\n"
+ "cntp x20, p3, p2.s\n"
+ "mov z25.s, #0x0\n"
+ "and p1.b, p3/Z, p8.b, p2.b\n"
"whilelt p0.b, XZR, x20\n"
- "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
- "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
- "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
- "zip1 z20.b, z18.b, z16.b\n"
- "zip1 z19.b, z17.b, z28.b\n"
- "ld1b { z18.b }, p0/Z, [x24]\n"
- "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
- "ld1b { z16.b }, p0/Z, [x24, x22]\n"
- "zip1 z22.b, z20.b, z19.b\n"
- "zip1 z21.b, z18.b, z16.b\n"
- "zip1 z19.b, z17.b, z28.b\n"
- "mov z20.s, #0x0\n"
- "ld1b { z18.b }, p0/Z, [x23]\n"
- "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
- "ld1b { z16.b }, p0/Z, [x23, x22]\n"
- "sdot z20.s, z25.b, z22.b\n"
- "zip1 z19.b, z21.b, z19.b\n"
- "sdot z20.s, z25.b, z19.b\n"
- "zip1 z18.b, z18.b, z16.b\n"
- "zip1 z16.b, z17.b, z28.b\n"
- "and p0.b, p2/Z, p8.b, p1.b\n"
- "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
- "zip1 z16.b, z18.b, z16.b\n"
- "sdot z20.s, z25.b, z16.b\n"
- "mls z17.s, p2/M, z20.s, z27.s\n"
+ "ld1w { z24.s }, p1/Z, [%x[bias], x24, LSL #2]\n"
+ "ld1b { z19.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
"add %x[weights], %x[weights], x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add z17.s, z17.s, z26.s\n"
- "st1w { z17.s }, p2, [%x[outptr]]\n"
- "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
- "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
- "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "ld1b { z23.b }, p0/Z, [x22]\n"
+ "ld1b { z20.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+ "ld1b { z17.b }, p0/Z, [x22, x23]\n"
+ "ld1b { z22.b }, p0/Z, [x21]\n"
+ "add x22, x22, x20\n"
+ "zip1 z19.b, z19.b, z16.b\n"
+ "zip1 z18.b, z18.b, z31.b\n"
+ "ld1b { z21.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+ "add x21, x21, x20\n"
+ "zip1 z20.b, z20.b, z31.b\n"
+ "zip1 z17.b, z23.b, z17.b\n"
+ "zip1 z19.b, z19.b, z18.b\n"
+ "zip1 z18.b, z22.b, z16.b\n"
+ "zip1 z16.b, z21.b, z31.b\n"
+ "zip1 z17.b, z17.b, z20.b\n"
+ "sdot z25.s, z30.b, z19.b\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "sdot z25.s, z30.b, z17.b\n"
+ "sdot z25.s, z30.b, z16.b\n"
+ "mls z24.s, p3/M, z25.s, z29.s\n"
+ "add z24.s, z24.s, z28.s\n"
+ "st1w { z24.s }, p3, [%x[outptr]]\n"
+ "st1b { z19.b }, p3, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z17.b }, p3, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p3, [%x[outptr], #3, MUL VL]\n"
"addvl %x[outptr], %x[outptr], #4\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [%x[rq_mul_perchannel], x24, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [%x[rq_shift_perchannel], x24, LSL #2]\n"
"3:" // Loop: Quantisation parameters: Store
- "incw x21\n"
- "whilelt p1.s, x21, %x[n_channels]\n"
- "st1w { z24.s }, p2, [%x[outptr]]\n"
- "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "incw x24\n"
+ "st1w { z27.s }, p3, [%x[outptr]]\n"
+ "st1w { z26.s }, p3, [%x[outptr], #1, MUL VL]\n"
"addvl %x[outptr], %x[outptr], #2\n"
+ "whilelt p2.s, x24, %x[n_channels]\n"
"b.any 2b\n"
: [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
index c3da81448b..e1b01663f6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,80 +52,80 @@ void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *
{
__asm__ __volatile__(
"cmp %x[ld_weight_col], XZR\n"
- "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
- "mov z16.s, #0x9\n"
- "mov z28.b, #0x0\n"
"mov x20, #0x3\n"
- "ptrue p2.b\n"
- "mul x20, %x[ld_weight_col], x20\n"
- "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
- "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "mov z16.s, #0x9\n"
+ "mov z31.b, #0x0\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "ptrue p3.b\n"
+ "mov z30.b, #0x1\n"
"cmp %x[ld_weight_row], XZR\n"
- "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
- "mov z25.b, #0x1\n"
- "mul z26.s, p2/M, z26.s, z27.s\n"
- "add x24, %x[weights], %x[ld_weight_row]\n"
- "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
- "add x23, x24, %x[ld_weight_row]\n"
- "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
- "whilelt p1.s, XZR, %x[n_channels]\n"
- "mov x21, #0x0\n"
- "mul z26.s, p2/M, z26.s, z16.s\n"
+ "mov x24, #0x0\n"
"pfalse p8.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z29.s }, p3/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "ld1rw { z27.s }, p3/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mul z28.s, p3/M, z28.s, z29.s\n"
+ "add x22, %x[weights], %x[ld_weight_row]\n"
+ "add x21, x22, %x[ld_weight_row]\n"
+ "mul z28.s, p3/M, z28.s, z16.s\n"
"cbz %x[bias], 1f\n"
"ptrue p8.s\n"
"1:" // No bias
"2:" // Loop
- "cntp x20, p2, p1.s\n"
+ "cntp x20, p3, p2.s\n"
+ "mov z25.s, #0x0\n"
+ "and p1.b, p3/Z, p8.b, p2.b\n"
"whilelt p0.b, XZR, x20\n"
- "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
- "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
- "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
- "zip1 z20.b, z18.b, z16.b\n"
- "zip1 z19.b, z17.b, z28.b\n"
- "ld1b { z18.b }, p0/Z, [x24]\n"
- "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
- "ld1b { z16.b }, p0/Z, [x24, x22]\n"
- "zip1 z22.b, z20.b, z19.b\n"
- "zip1 z21.b, z18.b, z16.b\n"
- "zip1 z19.b, z17.b, z28.b\n"
- "mov z20.s, #0x0\n"
- "ld1b { z18.b }, p0/Z, [x23]\n"
- "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
- "ld1b { z16.b }, p0/Z, [x23, x22]\n"
- "udot z20.s, z25.b, z22.b\n"
- "zip1 z19.b, z21.b, z19.b\n"
- "udot z20.s, z25.b, z19.b\n"
- "zip1 z18.b, z18.b, z16.b\n"
- "zip1 z16.b, z17.b, z28.b\n"
- "and p0.b, p2/Z, p8.b, p1.b\n"
- "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
- "zip1 z16.b, z18.b, z16.b\n"
- "udot z20.s, z25.b, z16.b\n"
- "mls z17.s, p2/M, z20.s, z27.s\n"
+ "ld1w { z24.s }, p1/Z, [%x[bias], x24, LSL #2]\n"
+ "ld1b { z19.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
"add %x[weights], %x[weights], x20\n"
- "add x24, x24, x20\n"
- "add x23, x23, x20\n"
- "add z17.s, z17.s, z26.s\n"
- "st1w { z17.s }, p2, [%x[outptr]]\n"
- "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
- "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
- "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "ld1b { z23.b }, p0/Z, [x22]\n"
+ "ld1b { z20.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+ "ld1b { z17.b }, p0/Z, [x22, x23]\n"
+ "ld1b { z22.b }, p0/Z, [x21]\n"
+ "add x22, x22, x20\n"
+ "zip1 z19.b, z19.b, z16.b\n"
+ "zip1 z18.b, z18.b, z31.b\n"
+ "ld1b { z21.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+ "add x21, x21, x20\n"
+ "zip1 z20.b, z20.b, z31.b\n"
+ "zip1 z17.b, z23.b, z17.b\n"
+ "zip1 z19.b, z19.b, z18.b\n"
+ "zip1 z18.b, z22.b, z16.b\n"
+ "zip1 z16.b, z21.b, z31.b\n"
+ "zip1 z17.b, z17.b, z20.b\n"
+ "udot z25.s, z30.b, z19.b\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "udot z25.s, z30.b, z17.b\n"
+ "udot z25.s, z30.b, z16.b\n"
+ "mls z24.s, p3/M, z25.s, z29.s\n"
+ "add z24.s, z24.s, z28.s\n"
+ "st1w { z24.s }, p3, [%x[outptr]]\n"
+ "st1b { z19.b }, p3, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z17.b }, p3, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p3, [%x[outptr], #3, MUL VL]\n"
"addvl %x[outptr], %x[outptr], #4\n"
"cbz %x[rq_mul_perchannel], 3f\n"
- "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [%x[rq_mul_perchannel], x24, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [%x[rq_shift_perchannel], x24, LSL #2]\n"
"3:" // Loop: Quantisation parameters: Store
- "incw x21\n"
- "whilelt p1.s, x21, %x[n_channels]\n"
- "st1w { z24.s }, p2, [%x[outptr]]\n"
- "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "incw x24\n"
+ "st1w { z27.s }, p3, [%x[outptr]]\n"
+ "st1w { z26.s }, p3, [%x[outptr], #1, MUL VL]\n"
"addvl %x[outptr], %x[outptr], #2\n"
+ "whilelt p2.s, x24, %x[n_channels]\n"
"b.any 2b\n"
: [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
: [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
- : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index d8ca3d7437..2ef4639e18 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,144 +87,144 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x23, #0x0\n"
- "mov x22, #0x0\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
"1:" // Tile loop
- "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x28, #0x2\n"
"mov x27, #0x2\n"
- "mov x26, #0x2\n"
- "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "str x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
"ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
"ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
- "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x15, x15, #0x1\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
- "lsr x22, %x[n_channels], #0x3\n"
- "add x11, x15, x15\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x13, x13, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x9, x13, x25, LSL #1\n"
- "mul x20, x20, x26\n" // offset *= output_tile_size
- "add x28, x9, x25, LSL #1\n"
- "add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x24, %x[n_channels], #0x3\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v27.8h }, [x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x23, #0x0\n"
"ld1r { v26.8h }, [x20]\n"
- "add x27, x28, x25, LSL #1\n"
- "add x26, x11, x15\n"
- "add x25, x12, x24, LSL #1\n"
+ "mul x22, x10, x26\n" // offset = tile_i * ld_input_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x21, XZR, x16\n"
+ "mul x20, x10, x25\n" // offset = tile_i * ld_output_row
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x22, x9, x15, x22\n" // offset += tile_j * ld_input_col
+ "lsl x15, x15, #0x1\n"
+ "madd x20, x9, x14, x20\n" // offset += tile_j * ld_output_col
"lsl x14, x14, #0x1\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q0, [x10, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q1, [x10, #0x20]\n"
- "ldr q2, [x10, #0x30]\n"
- "ldr q3, [x10, #0x40]\n"
- "ldr q4, [x10, #0x50]\n"
- "ldr q5, [x10, #0x60]\n"
- "ldr q6, [x10, #0x70]\n"
- "ldr q7, [x10, #0x80]\n"
- "ldr q8, [x10, #0x90]\n"
- "add x10, x10, #0xa0\n"
- "ldr q9, [x9, x15]\n"
+ "mul x22, x22, x28\n" // offset *= kernel_stride * output_size
+ "add x10, x15, x15\n"
+ "add x9, x10, x15\n"
+ "mul x20, x20, x27\n" // offset *= output_tile_size
+ "add x13, x13, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x28, x13, x26, LSL #1\n"
+ "add x27, x28, x26, LSL #1\n"
+ "add x26, x27, x26, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x25, x12, x25, LSL #1\n"
+ "cbz x24, 4f\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q0, [x11, #0x10]\n"
+ "cmp x16, x24, LSL #4\n"
+ "ldr q1, [x11, #0x20]\n"
+ "ldr q2, [x11, #0x30]\n"
+ "ldr q3, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
+ "add x11, x11, #0xa0\n"
+ "ldr q9, [x28, x15]\n"
"ld1 { v10.8h }, [x13]\n"
- "ldr q11, [x13, x26]\n"
- "ldr q12, [x9, x11]\n"
- "ldr q13, [x28, x15]\n"
+ "ldr q11, [x13, x9]\n"
+ "ldr q12, [x28, x10]\n"
+ "ldr q13, [x27, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
"mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
- "add x23, x23, #0x10\n"
- "cmp x23, x22, LSL #4\n"
+ "add x16, x16, #0x10\n"
+ "add x21, x21, #0x10\n"
"mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
"mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "ld1 { v18.8h }, [x27]\n"
- "ldr q25, [x10, #0x0]\n"
+ "ld1 { v18.8h }, [x26]\n"
+ "ldr q25, [x11, #0x0]\n"
+ "cmp x16, x24, LSL #4\n"
+ "add x23, x23, #0x10\n"
"fmla v24.8h, v0.8h, v10.8h\n"
- "ldr q20, [x28, x11]\n"
+ "ldr q20, [x27, x10]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
- "ldr q17, [x27, x26]\n"
+ "ldr q17, [x26, x9]\n"
"fmla v22.8h, v2.8h, v12.8h\n"
"fmla v21.8h, v1.8h, v12.8h\n"
- "add x20, x20, #0x10\n"
- "add x21, x21, #0x10\n"
"fmla v24.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"ldr q16, [x13, x15]\n"
"fmla v22.8h, v6.8h, v18.8h\n"
- "ldr q18, [x13, x11]\n"
- "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr q18, [x13, x10]\n"
"add x13, x13, #0x10\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
"fmla v24.8h, v7.8h, v13.8h\n"
"fmla v23.8h, v6.8h, v13.8h\n"
"fmla v22.8h, v4.8h, v13.8h\n"
"fmla v21.8h, v8.8h, v17.8h\n"
- "ld1 { v17.8h }, [x9]\n"
+ "ld1 { v17.8h }, [x28]\n"
"fmla v24.8h, v1.8h, v16.8h\n"
"fmla v23.8h, v0.8h, v16.8h\n"
- "ldr q16, [x9, x26]\n"
- "add x9, x9, #0x10\n"
+ "ldr q16, [x28, x9]\n"
+ "add x28, x28, #0x10\n"
"fmla v22.8h, v5.8h, v20.8h\n"
"fmla v21.8h, v4.8h, v20.8h\n"
- "ldr q4, [x10, #0x50]\n"
+ "ldr q4, [x11, #0x50]\n"
"fmla v24.8h, v2.8h, v18.8h\n"
"fmla v23.8h, v1.8h, v18.8h\n"
- "ld1 { v19.8h }, [x28]\n"
- "ldr q1, [x10, #0x20]\n"
+ "ld1 { v19.8h }, [x27]\n"
+ "ldr q1, [x11, #0x20]\n"
"fmla v22.8h, v0.8h, v17.8h\n"
- "ldr q0, [x10, #0x10]\n"
+ "ldr q0, [x11, #0x10]\n"
"fmla v21.8h, v2.8h, v16.8h\n"
- "ldr q2, [x10, #0x30]\n"
+ "ldr q2, [x11, #0x30]\n"
"fmla v24.8h, v8.8h, v20.8h\n"
"fmla v23.8h, v7.8h, v20.8h\n"
- "ldr q18, [x28, x26]\n"
- "add x28, x28, #0x10\n"
- "ldr q13, [x28, x15]\n"
+ "ldr q18, [x27, x9]\n"
+ "add x27, x27, #0x10\n"
+ "ldr q13, [x27, x15]\n"
"fmla v22.8h, v3.8h, v19.8h\n"
"fmla v21.8h, v5.8h, v18.8h\n"
"fmla v24.8h, v3.8h, v17.8h\n"
- "ldr q17, [x27, x15]\n"
- "ldr q3, [x10, #0x40]\n"
+ "ldr q17, [x26, x15]\n"
+ "ldr q3, [x11, #0x40]\n"
"fmla v23.8h, v5.8h, v16.8h\n"
- "ldr q16, [x27, x11]\n"
- "ldr q5, [x10, #0x60]\n"
+ "ldr q16, [x26, x10]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "add x26, x26, #0x10\n"
"fmla v22.8h, v7.8h, v17.8h\n"
"fmla v21.8h, v6.8h, v17.8h\n"
- "ldr q11, [x13, x26]\n"
+ "ldr q11, [x13, x9]\n"
"fmla v24.8h, v6.8h, v19.8h\n"
- "ldr q9, [x9, x15]\n"
+ "ldr q9, [x28, x15]\n"
+ "ldr q6, [x11, #0x70]\n"
"fmla v23.8h, v8.8h, v18.8h\n"
"ld1 { v10.8h }, [x13]\n"
- "ldr q6, [x10, #0x70]\n"
"fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x11, #0x90]\n"
"fmla v21.8h, v7.8h, v16.8h\n"
- "ldr q12, [x9, x11]\n"
- "ldr q7, [x10, #0x80]\n"
+ "ldr q12, [x28, x10]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "add x11, x11, #0xa0\n"
"fmax v24.8h, v24.8h, v27.8h\n"
"fmax v23.8h, v23.8h, v27.8h\n"
- "ldr q8, [x10, #0x90]\n"
"fmax v22.8h, v22.8h, v27.8h\n"
"fmax v21.8h, v21.8h, v27.8h\n"
- "add x27, x27, #0x10\n"
"fmin v24.8h, v24.8h, v26.8h\n"
"fmin v23.8h, v23.8h, v26.8h\n"
- "st1 { v24.8h }, [x12]\n"
- "add x10, x10, #0xa0\n"
"fmin v22.8h, v22.8h, v26.8h\n"
"fmin v21.8h, v21.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
"str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
"st1 { v22.8h }, [x25]\n"
@@ -236,58 +236,58 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
"mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
"mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "ld1 { v18.8h }, [x27]\n"
+ "ld1 { v18.8h }, [x26]\n"
"fmla v24.8h, v0.8h, v10.8h\n"
- "ldr q20, [x28, x11]\n"
+ "ldr q20, [x27, x10]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
- "ldr q17, [x27, x26]\n"
+ "ldr q17, [x26, x9]\n"
"fmla v22.8h, v2.8h, v12.8h\n"
"fmla v21.8h, v1.8h, v12.8h\n"
"fmla v24.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"ldr q16, [x13, x15]\n"
"fmla v22.8h, v6.8h, v18.8h\n"
- "ldr q18, [x13, x11]\n"
- "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr q18, [x13, x10]\n"
"add x13, x13, #0x10\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
"fmla v24.8h, v7.8h, v13.8h\n"
"fmla v23.8h, v6.8h, v13.8h\n"
"fmla v22.8h, v4.8h, v13.8h\n"
"fmla v21.8h, v8.8h, v17.8h\n"
- "ld1 { v17.8h }, [x9]\n"
+ "ld1 { v17.8h }, [x28]\n"
"fmla v24.8h, v1.8h, v16.8h\n"
"fmla v23.8h, v0.8h, v16.8h\n"
- "ldr q16, [x9, x26]\n"
- "add x9, x9, #0x10\n"
+ "ldr q16, [x28, x9]\n"
+ "add x28, x28, #0x10\n"
"fmla v22.8h, v5.8h, v20.8h\n"
"fmla v21.8h, v4.8h, v20.8h\n"
"fmla v24.8h, v2.8h, v18.8h\n"
"fmla v23.8h, v1.8h, v18.8h\n"
- "ld1 { v19.8h }, [x28]\n"
+ "ld1 { v19.8h }, [x27]\n"
"fmla v22.8h, v0.8h, v17.8h\n"
"fmla v21.8h, v2.8h, v16.8h\n"
"fmla v24.8h, v8.8h, v20.8h\n"
"fmla v23.8h, v7.8h, v20.8h\n"
- "ldr q18, [x28, x26]\n"
- "add x28, x28, #0x10\n"
+ "ldr q18, [x27, x9]\n"
+ "add x27, x27, #0x10\n"
"fmla v22.8h, v3.8h, v19.8h\n"
"fmla v21.8h, v5.8h, v18.8h\n"
"fmla v24.8h, v3.8h, v17.8h\n"
- "ldr q17, [x27, x15]\n"
+ "ldr q17, [x26, x15]\n"
"fmla v23.8h, v5.8h, v16.8h\n"
- "ldr q16, [x27, x11]\n"
+ "ldr q16, [x26, x10]\n"
+ "add x26, x26, #0x10\n"
"fmla v22.8h, v7.8h, v17.8h\n"
"fmla v21.8h, v6.8h, v17.8h\n"
- "add x27, x27, #0x10\n"
"fmla v24.8h, v6.8h, v19.8h\n"
"fmla v23.8h, v8.8h, v18.8h\n"
- "fmax v24.8h, v24.8h, v27.8h\n"
"fmla v22.8h, v8.8h, v16.8h\n"
"fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
"fmax v23.8h, v23.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
"fmax v22.8h, v22.8h, v27.8h\n"
"fmax v21.8h, v21.8h, v27.8h\n"
- "fmin v24.8h, v24.8h, v26.8h\n"
"fmin v23.8h, v23.8h, v26.8h\n"
"st1 { v24.8h }, [x12]\n"
"fmin v22.8h, v22.8h, v26.8h\n"
@@ -300,21 +300,21 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 57f\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q0, [x10, #0x10]\n"
- "add x24, x9, x15\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q0, [x11, #0x10]\n"
+ "add x24, x28, x15\n"
"add x23, x13, XZR\n"
- "ldr q1, [x10, #0x20]\n"
- "ldr q2, [x10, #0x30]\n"
- "add x22, x13, x26\n"
- "add x21, x9, x11\n"
- "ldr q3, [x10, #0x40]\n"
- "ldr q4, [x10, #0x50]\n"
- "add x20, x28, x15\n"
- "ldr q5, [x10, #0x60]\n"
- "ldr q6, [x10, #0x70]\n"
- "ldr q7, [x10, #0x80]\n"
- "ldr q8, [x10, #0x90]\n"
+ "ldr q1, [x11, #0x20]\n"
+ "ldr q2, [x11, #0x30]\n"
+ "add x22, x13, x9\n"
+ "add x21, x28, x10\n"
+ "ldr q3, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ "add x20, x27, x15\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
"tbz %x[n_channels], #2, 6f\n"
"ldr d9, [x24], #0x8\n"
"ldr d10, [x23], #0x8\n"
@@ -365,15 +365,15 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
"mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "add x20, x27, XZR\n"
+ "add x20, x26, XZR\n"
"mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
"tbz %x[n_channels], #2, 10f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -396,10 +396,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"12:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v30.8h, v6.8h, v9.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x20, x27, x26\n"
+ "add x20, x26, x9\n"
"fmla v29.8h, v6.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 14f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
@@ -444,7 +444,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "add x20, x13, x11\n"
+ "add x20, x13, x10\n"
"tbz %x[n_channels], #2, 22f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
@@ -467,7 +467,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "add x20, x28, x11\n"
+ "add x20, x27, x10\n"
"tbz %x[n_channels], #2, 26f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
@@ -490,7 +490,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x20, x9, XZR\n"
+ "add x20, x28, XZR\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
"tbz %x[n_channels], #2, 30f\n"
@@ -515,7 +515,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
"fmla v28.8h, v3.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x20, x9, x26\n"
+ "add x20, x28, x9\n"
"tbz %x[n_channels], #2, 34f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
@@ -538,7 +538,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"36:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v29.8h, v5.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v12.8h\n"
- "add x20, x28, XZR\n"
+ "add x20, x27, XZR\n"
"tbz %x[n_channels], #2, 38f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
@@ -561,7 +561,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"40:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v28.8h, v6.8h, v9.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x20, x28, x26\n"
+ "add x20, x27, x9\n"
"tbz %x[n_channels], #2, 42f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
@@ -584,7 +584,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
"fmla v29.8h, v8.8h, v10.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
- "add x20, x27, x15\n"
+ "add x20, x26, x15\n"
"tbz %x[n_channels], #2, 46f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
@@ -607,7 +607,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "add x20, x27, x11\n"
+ "add x20, x26, x10\n"
"tbz %x[n_channels], #2, 50f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
@@ -632,28 +632,28 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmla v31.8h, v7.8h, v12.8h\n"
"fmax v28.8h, v28.8h, v27.8h\n"
"fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
"fmax v30.8h, v30.8h, v27.8h\n"
"fmax v31.8h, v31.8h, v27.8h\n"
- "fmin v28.8h, v28.8h, v26.8h\n"
"fmin v29.8h, v29.8h, v26.8h\n"
"fmin v30.8h, v30.8h, v26.8h\n"
"fmin v31.8h, v31.8h, v26.8h\n"
"tbz %x[n_channels], #2, 54f\n"
"mov x21, x12\n"
"mov x20, x25\n"
- "st1 { v28.d }[0], [x21], x14\n"
- "st1 { v30.d }[0], [x20], x14\n"
"add x12, x12, #0x8\n"
"add x25, x25, #0x8\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
"st1 { v29.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 53f\n"
"mov x21, x12\n"
"mov x20, x25\n"
- "st1 { v28.s }[2], [x21], x14\n"
- "st1 { v30.s }[2], [x20], x14\n"
"add x12, x12, #0x4\n"
"add x25, x25, #0x4\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
"st1 { v29.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 56f\n"
@@ -677,10 +677,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"tbz %x[n_channels], #1, 55f\n"
"mov x21, x12\n"
"mov x20, x25\n"
- "st1 { v28.s }[0], [x21], x14\n"
- "st1 { v30.s }[0], [x20], x14\n"
"add x12, x12, #0x4\n"
"add x25, x25, #0x4\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 56f\n"
@@ -700,20 +700,20 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v31.h }[0], [x20]\n"
"56:" // Tile loop: Oddments: Store: Bit 2: End
"57:" // Tile loop: End
- "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x22, x22, #0x1\n"
- "add x21, x23, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x22, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x23, x23, x21, LT\n"
- "csel x22, x22, XZR, LT\n"
- "cmp x23, x20\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x9, x9, #0x1\n"
+ "add x20, x10, #0x1\n"
+ "cmp x9, x22\n"
+ "csel x10, x10, x20, LT\n"
+ "csel x9, x9, XZR, LT\n"
+ "cmp x10, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index c9a554e9ad..90da1a803e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,237 +78,237 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x16, #0x10\n" // cntb _, ALL, #1
- "lsr x15, %x[n_channels], #0x3\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v27.8h }, [x20]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x16, %x[n_channels], #0x3\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x21]\n"
"ld1r { v26.8h }, [x20]\n"
- "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x12, x11, [x21, #0x0]\n"
- "ldp x10, x9, [x21, #0x10]\n"
- "mov x28, #0x0\n"
- "sub x27, XZR, x16\n"
- "cbz x15, 3f\n"
- "ldr q25, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "cmp x16, x15, LSL #4\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
- "ldp x21, x20, [x13, #0x0]\n"
- "ldr q9, [x21, x28]\n"
- "ldr q10, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "ldr q11, [x21, x28]\n"
- "ldr q12, [x20, x28]\n"
- "ldr x20, [x13, #0x20]\n"
- "ldr q13, [x20, x28]\n"
+ "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x22, #0x0]\n"
+ "ldp x10, x9, [x22, #0x10]\n"
+ "sub x28, XZR, x17\n"
+ "cbz x16, 3f\n"
+ "ldr q25, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x17, x16, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x24, x23, [x14, #0x0]\n"
+ "ldp x22, x21, [x14, #0x10]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "ldr q9, [x24, x13]\n"
+ "ldr q10, [x23, x13]\n"
+ "ldr q11, [x22, x13]\n"
+ "ldr q12, [x21, x13]\n"
+ "ldr q13, [x20, x13]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
"mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
- "ldr x21, [x13, #0x28]\n"
- "ldr x20, [x13, #0x30]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "ldr x21, [x14, #0x30]\n"
"mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
"mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "ldr q18, [x21, x28]\n"
- "ldr q25, [x14, #0x0]\n"
+ "ldr q25, [x15, #0x0]\n"
+ "ldr x24, [x14, #0x38]\n"
+ "ldr x20, [x14, #0x48]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q18, [x22, x13]\n"
+ "ldr x22, [x14, #0x50]\n"
"fmla v24.8h, v0.8h, v10.8h\n"
"fmla v23.8h, v2.8h, v11.8h\n"
- "ldr q17, [x20, x28]\n"
- "ldr x21, [x13, #0x38]\n"
+ "ldr q17, [x21, x13]\n"
+ "ldr x21, [x14, #0x58]\n"
+ "ldr q20, [x20, x13]\n"
"fmla v22.8h, v2.8h, v12.8h\n"
"fmla v21.8h, v1.8h, v12.8h\n"
- "ldr x20, [x13, #0x48]\n"
- "ldr q20, [x20, x28]\n"
+ "ldr x20, [x14, #0x60]\n"
+ "ldr x27, [x14, #0x68]\n"
+ "ldr x26, [x14, #0x70]\n"
"fmla v24.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x20, [x13, #0x40]\n"
+ "ldr q16, [x24, x13]\n"
+ "ldr x25, [x14, #0x78]\n"
"fmla v22.8h, v6.8h, v18.8h\n"
- "ldr q18, [x20, x28]\n"
+ "ldr q18, [x23, x13]\n"
+ "ldp x24, x23, [x14, #0x0]\n"
"fmla v21.8h, v3.8h, v13.8h\n"
- "ldr x20, [x13, #0x50]\n"
"fmla v24.8h, v7.8h, v13.8h\n"
"fmla v23.8h, v6.8h, v13.8h\n"
- "ldr x22, [x13, #0x58]\n"
- "ldr x21, [x13, #0x60]\n"
"fmla v22.8h, v4.8h, v13.8h\n"
"fmla v21.8h, v8.8h, v17.8h\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0x68]\n"
+ "ldr q17, [x22, x13]\n"
"fmla v24.8h, v1.8h, v16.8h\n"
"fmla v23.8h, v0.8h, v16.8h\n"
- "ldr q16, [x22, x28]\n"
- "ldr x26, [x13, #0x70]\n"
+ "ldr q16, [x21, x13]\n"
+ "ldp x22, x21, [x14, #0x10]\n"
"fmla v22.8h, v5.8h, v20.8h\n"
"fmla v21.8h, v4.8h, v20.8h\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr x25, [x13, #0x78]\n"
+ "ldr q4, [x15, #0x50]\n"
"fmla v24.8h, v2.8h, v18.8h\n"
"fmla v23.8h, v1.8h, v18.8h\n"
- "ldr q19, [x21, x28]\n"
- "ldr q1, [x14, #0x20]\n"
+ "ldr q19, [x20, x13]\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr x20, [x14, #0x20]\n"
"fmla v22.8h, v0.8h, v17.8h\n"
- "ldr q0, [x14, #0x10]\n"
+ "ldr q0, [x15, #0x10]\n"
"fmla v21.8h, v2.8h, v16.8h\n"
- "ldr q2, [x14, #0x30]\n"
+ "ldr q2, [x15, #0x30]\n"
"fmla v24.8h, v8.8h, v20.8h\n"
+ "ldr q13, [x20, x17]\n"
"fmla v23.8h, v7.8h, v20.8h\n"
- "ldr q18, [x20, x28]\n"
- "ldp x24, x23, [x13, #0x0]\n"
+ "ldr q18, [x27, x13]\n"
"fmla v22.8h, v3.8h, v19.8h\n"
"fmla v21.8h, v5.8h, v18.8h\n"
- "ldp x22, x21, [x13, #0x10]\n"
- "ldr x20, [x13, #0x20]\n"
- "ldr q13, [x20, x16]\n"
"fmla v24.8h, v3.8h, v17.8h\n"
- "ldr q17, [x26, x28]\n"
+ "ldr q17, [x26, x13]\n"
+ "ldr q3, [x15, #0x40]\n"
"fmla v23.8h, v5.8h, v16.8h\n"
- "ldr q16, [x25, x28]\n"
- "ldr q3, [x14, #0x40]\n"
+ "ldr q16, [x25, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "add x13, x13, #0x10\n"
"fmla v22.8h, v7.8h, v17.8h\n"
"fmla v21.8h, v6.8h, v17.8h\n"
- "ldr q11, [x22, x16]\n"
- "ldr q5, [x14, #0x60]\n"
+ "ldr q11, [x22, x17]\n"
"fmla v24.8h, v6.8h, v19.8h\n"
+ "ldr q9, [x24, x17]\n"
+ "ldr q6, [x15, #0x70]\n"
"fmla v23.8h, v8.8h, v18.8h\n"
- "ldr q9, [x24, x16]\n"
- "ldr q10, [x23, x16]\n"
+ "ldr q10, [x23, x17]\n"
"fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x15, #0x90]\n"
"fmla v21.8h, v7.8h, v16.8h\n"
- "ldr q12, [x21, x16]\n"
- "ldr q6, [x14, #0x70]\n"
+ "ldr q12, [x21, x17]\n"
+ "add x17, x17, #0x10\n"
+ "ldr q7, [x15, #0x80]\n"
+ "cmp x17, x16, LSL #4\n"
+ "add x15, x15, #0xa0\n"
"fmax v24.8h, v24.8h, v27.8h\n"
"fmax v23.8h, v23.8h, v27.8h\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
"fmax v22.8h, v22.8h, v27.8h\n"
"fmax v21.8h, v21.8h, v27.8h\n"
- "add x16, x16, #0x10\n"
- "add x27, x27, #0x10\n"
"fmin v24.8h, v24.8h, v26.8h\n"
"fmin v23.8h, v23.8h, v26.8h\n"
- "cmp x16, x15, LSL #4\n"
"fmin v22.8h, v22.8h, v26.8h\n"
"fmin v21.8h, v21.8h, v26.8h\n"
- "add x28, x28, #0x10\n"
- "str q24, [x12, x27]\n"
- "add x14, x14, #0xa0\n"
- "str q23, [x11, x27]\n"
- "str q22, [x10, x27]\n"
- "str q21, [x9, x27]\n"
+ "str q24, [x12, x28]\n"
+ "str q23, [x11, x28]\n"
+ "str q22, [x10, x28]\n"
+ "str q21, [x9, x28]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
"mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
- "ldr x21, [x13, #0x28]\n"
- "ldr x20, [x13, #0x30]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "ldr x21, [x14, #0x30]\n"
"mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
"mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "ldr q18, [x21, x28]\n"
- "ldr x21, [x13, #0x38]\n"
+ "ldr x27, [x14, #0x38]\n"
+ "ldr x20, [x14, #0x48]\n"
+ "ldr x26, [x14, #0x40]\n"
+ "ldr x25, [x14, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q18, [x22, x13]\n"
+ "ldr x24, [x14, #0x58]\n"
"fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x20, x13]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0x48]\n"
- "ldr q20, [x20, x28]\n"
+ "ldr q17, [x21, x13]\n"
"fmla v22.8h, v2.8h, v12.8h\n"
"fmla v21.8h, v1.8h, v12.8h\n"
- "ldr x20, [x13, #0x40]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "ldr x21, [x14, #0x70]\n"
"fmla v24.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x21, [x13, #0x50]\n"
+ "ldr q16, [x27, x13]\n"
+ "ldr x20, [x14, #0x78]\n"
"fmla v22.8h, v6.8h, v18.8h\n"
- "ldr q18, [x20, x28]\n"
+ "ldr q18, [x26, x13]\n"
"fmla v21.8h, v3.8h, v13.8h\n"
- "ldr x20, [x13, #0x58]\n"
"fmla v24.8h, v7.8h, v13.8h\n"
"fmla v23.8h, v6.8h, v13.8h\n"
- "ldr x23, [x13, #0x60]\n"
- "ldr x22, [x13, #0x68]\n"
"fmla v22.8h, v4.8h, v13.8h\n"
"fmla v21.8h, v8.8h, v17.8h\n"
- "ldr q17, [x21, x28]\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x25, x13]\n"
"fmla v24.8h, v1.8h, v16.8h\n"
"fmla v23.8h, v0.8h, v16.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr x20, [x13, #0x78]\n"
+ "ldr q16, [x24, x13]\n"
"fmla v22.8h, v5.8h, v20.8h\n"
"fmla v21.8h, v4.8h, v20.8h\n"
- "add x27, x27, #0x10\n"
"fmla v24.8h, v2.8h, v18.8h\n"
"fmla v23.8h, v1.8h, v18.8h\n"
- "ldr q19, [x23, x28]\n"
+ "ldr q19, [x23, x13]\n"
"fmla v22.8h, v0.8h, v17.8h\n"
"fmla v21.8h, v2.8h, v16.8h\n"
"fmla v24.8h, v8.8h, v20.8h\n"
"fmla v23.8h, v7.8h, v20.8h\n"
- "ldr q18, [x22, x28]\n"
+ "ldr q18, [x22, x13]\n"
"fmla v22.8h, v3.8h, v19.8h\n"
"fmla v21.8h, v5.8h, v18.8h\n"
"fmla v24.8h, v3.8h, v17.8h\n"
- "ldr q17, [x21, x28]\n"
+ "ldr q17, [x21, x13]\n"
"fmla v23.8h, v5.8h, v16.8h\n"
- "ldr q16, [x20, x28]\n"
+ "ldr q16, [x20, x13]\n"
+ "add x13, x13, #0x10\n"
"fmla v22.8h, v7.8h, v17.8h\n"
"fmla v21.8h, v6.8h, v17.8h\n"
- "add x28, x28, #0x10\n"
"fmla v24.8h, v6.8h, v19.8h\n"
"fmla v23.8h, v8.8h, v18.8h\n"
- "fmax v24.8h, v24.8h, v27.8h\n"
"fmla v22.8h, v8.8h, v16.8h\n"
"fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
"fmax v23.8h, v23.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
"fmax v22.8h, v22.8h, v27.8h\n"
"fmax v21.8h, v21.8h, v27.8h\n"
- "fmin v24.8h, v24.8h, v26.8h\n"
"fmin v23.8h, v23.8h, v26.8h\n"
- "str q24, [x12, x27]\n"
+ "str q24, [x12, x28]\n"
"fmin v22.8h, v22.8h, v26.8h\n"
"fmin v21.8h, v21.8h, v26.8h\n"
- "str q23, [x11, x27]\n"
- "str q22, [x10, x27]\n"
- "str q21, [x9, x27]\n"
+ "str q23, [x11, x28]\n"
+ "str q22, [x10, x28]\n"
+ "str q21, [x9, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 56f\n"
- "ldr q25, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "mov x20, x28\n"
+ "ldr q25, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x20, x13\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
"add x12, x12, x20\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
"add x11, x11, x20\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
"add x10, x10, x20\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
"add x9, x9, x20\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "ldr x24, [x13, #0x0]\n"
- "ldr x23, [x13, #0x8]\n"
- "add x24, x24, x28\n"
- "add x23, x23, x28\n"
- "ldr x22, [x13, #0x10]\n"
- "ldr x21, [x13, #0x18]\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "ldr x20, [x13, #0x20]\n"
- "add x20, x20, x28\n"
+ "ldr x24, [x14, #0x0]\n"
+ "ldr x23, [x14, #0x8]\n"
+ "ldr x22, [x14, #0x10]\n"
+ "ldr x21, [x14, #0x18]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
"ld1 { v10.d }[0], [x23], #0x8\n"
@@ -359,16 +359,16 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
"mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
"mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
- "ldr x20, [x13, #0x28]\n"
- "add x20, x20, x28\n"
+ "ldr x20, [x14, #0x28]\n"
"mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x13\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v12.8h\n"
- "fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
"tbz %x[n_channels], #2, 9f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
@@ -390,12 +390,12 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v9.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (3, 0): Bit 2: End
"fmla v30.8h, v6.8h, v9.8h\n"
- "ldr x20, [x13, #0x30]\n"
+ "ldr x20, [x14, #0x30]\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x20, x20, x28\n"
"fmla v29.8h, v6.8h, v13.8h\n"
- "fmla v30.8h, v4.8h, v13.8h\n"
"fmla v31.8h, v3.8h, v13.8h\n"
+ "add x20, x20, x13\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -416,9 +416,9 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x20, [x13, #0x38]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla v31.8h, v8.8h, v11.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 17f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
@@ -439,10 +439,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (0, 1): Bit 2: End
- "ldr x20, [x13, #0x40]\n"
+ "ldr x20, [x14, #0x40]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
"fmla v29.8h, v0.8h, v12.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -463,10 +463,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v9.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (0, 2): Bit 2: End
- "ldr x20, [x13, #0x48]\n"
+ "ldr x20, [x14, #0x48]\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -487,12 +487,12 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 2): Bit 2: End
- "ldr x20, [x13, #0x50]\n"
+ "ldr x20, [x14, #0x50]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x20, x20, x28\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 29f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
@@ -513,10 +513,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (1, 0): Bit 2: End
- "ldr x20, [x13, #0x58]\n"
+ "ldr x20, [x14, #0x58]\n"
"fmla v28.8h, v3.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 33f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -537,10 +537,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x20, [x13, #0x60]\n"
+ "ldr x20, [x14, #0x60]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
"fmla v31.8h, v2.8h, v12.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -561,10 +561,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v9.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x20, [x13, #0x68]\n"
+ "ldr x20, [x14, #0x68]\n"
"fmla v28.8h, v6.8h, v9.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -585,10 +585,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v10.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x20, [x13, #0x70]\n"
+ "ldr x20, [x14, #0x70]\n"
"fmla v29.8h, v8.8h, v10.8h\n"
"fmla v31.8h, v5.8h, v10.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 45f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
@@ -609,10 +609,10 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x13, #0x78]\n"
+ "ldr x20, [x14, #0x78]\n"
"fmla v30.8h, v7.8h, v11.8h\n"
"fmla v31.8h, v6.8h, v11.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #2, 49f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
@@ -637,9 +637,9 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v31.8h, v7.8h, v12.8h\n"
"fmax v28.8h, v28.8h, v27.8h\n"
"fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
"fmax v30.8h, v30.8h, v27.8h\n"
"fmax v31.8h, v31.8h, v27.8h\n"
- "fmin v28.8h, v28.8h, v26.8h\n"
"fmin v29.8h, v29.8h, v26.8h\n"
"fmin v30.8h, v30.8h, v26.8h\n"
"fmin v31.8h, v31.8h, v26.8h\n"
@@ -687,7 +687,7 @@ void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"56:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 4e64a2bf2b..778a95072a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,52 +87,52 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x24, #0x0\n"
- "mov x23, #0x0\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
"1:" // Tile loop
- "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x28, #0x3\n"
"mov x27, #0x3\n"
- "mov x26, #0x3\n"
- "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
- "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
- "mov x24, #0x10\n" // cntb _, ALL, #1
- "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
- "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x8, x8, #0x1\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
- "lsl x17, x17, #0x1\n"
- "lsr x23, %x[n_channels], #0x3\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x16, x16, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x13, x16, x25, LSL #1\n"
- "mul x20, x20, x26\n" // offset *= output_tile_size
- "add x12, x13, x25, LSL #1\n"
- "add x11, x8, x8\n"
- "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "add x10, x12, x25, LSL #1\n"
- "add x9, x11, x8\n"
- "add x28, x15, x22, LSL #1\n"
+ "str x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsr x17, %x[n_channels], #0x3\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v15.8h }, [x20]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x24, #0x0\n"
"ld1r { v14.8h }, [x20]\n"
- "add x27, x10, x25, LSL #1\n"
- "add x26, x9, x8\n"
- "add x25, x28, x22, LSL #1\n"
- "add x22, x17, x17\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x24\n"
- "cbz x23, 4f\n"
+ "mul x23, x10, x26\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x22, XZR, x6\n"
+ "mul x21, x10, x25\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x23, x9, x7, x23\n" // offset += tile_j * ld_input_col
+ "lsl x7, x7, #0x1\n"
+ "madd x21, x9, x8, x21\n" // offset += tile_j * ld_output_col
+ "lsl x8, x8, #0x1\n"
+ "mul x23, x23, x28\n" // offset *= kernel_stride * output_size
+ "add x13, x7, x7\n"
+ "add x12, x13, x7\n"
+ "add x11, x12, x7\n"
+ "mul x21, x21, x27\n" // offset *= output_tile_size
+ "add x20, x8, x8\n"
+ "add x16, x16, x23, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x10, x16, x26, LSL #1\n"
+ "add x9, x10, x26, LSL #1\n"
+ "add x15, x15, x21, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x28, x9, x26, LSL #1\n"
+ "add x27, x15, x25, LSL #1\n"
+ "add x26, x28, x26, LSL #1\n"
+ "add x25, x27, x25, LSL #1\n"
+ "cbz x17, 4f\n"
"ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "cmp x24, x23, LSL #4\n"
+ "cmp x6, x17, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
"ldr q3, [x14, #0x40]\n"
@@ -142,321 +142,321 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
"add x14, x14, #0xa0\n"
- "ldr q9, [x12, x11]\n"
+ "ldr q9, [x9, x13]\n"
"ld1 { v10.8h }, [x16]\n"
- "ldr q11, [x16, x26]\n"
- "ld1 { v12.8h }, [x27]\n"
- "ldr q13, [x13, x11]\n"
+ "ldr q11, [x16, x11]\n"
+ "ld1 { v12.8h }, [x26]\n"
+ "ldr q13, [x10, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
- "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "add x6, x6, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "cmp x6, x17, LSL #4\n"
"add x24, x24, #0x10\n"
- "cmp x24, x23, LSL #4\n"
- "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v4.8h, v13.8h\n"
- "add x20, x20, #0x10\n"
- "add x21, x21, #0x10\n"
- "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
- "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q23, [x12, x9]\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "ldr q18, [x12, x8]\n"
- "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v6.8h, v18.8h\n"
- "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v27.8h, v3.8h, v13.8h\n"
- "fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v25.8h, v1.8h, v13.8h\n"
- "fmla v24.8h, v0.8h, v13.8h\n"
- "ldr q17, [x16, x8]\n"
- "fmla v22.8h, v6.8h, v12.8h\n"
- "ldr q16, [x27, x26]\n"
- "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x9, x12]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x9, x7]\n"
+ "fmla v27.8h, v2.8h, v13.8h\n"
+ "fmla v26.8h, v1.8h, v13.8h\n"
+ "fmla v25.8h, v0.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x26, x11]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
"ldr q31, [x14, #0x0]\n"
- "fmla v29.8h, v0.8h, v17.8h\n"
- "fmla v21.8h, v8.8h, v16.8h\n"
- "ldr q16, [x16, x9]\n"
- "fmla v28.8h, v7.8h, v18.8h\n"
- "fmla v20.8h, v0.8h, v18.8h\n"
- "fmla v26.8h, v4.8h, v18.8h\n"
- "fmla v25.8h, v3.8h, v18.8h\n"
- "fmla v22.8h, v1.8h, v18.8h\n"
- "ld1 { v19.8h }, [x13]\n"
- "fmla v29.8h, v2.8h, v16.8h\n"
- "fmla v27.8h, v1.8h, v16.8h\n"
- "ld1 { v18.8h }, [x10]\n"
- "fmla v24.8h, v4.8h, v23.8h\n"
- "fmla v28.8h, v1.8h, v17.8h\n"
- "ldr q16, [x13, x26]\n"
- "fmla v20.8h, v2.8h, v23.8h\n"
- "fmla v21.8h, v1.8h, v23.8h\n"
- "fmla v29.8h, v8.8h, v23.8h\n"
- "fmla v27.8h, v7.8h, v23.8h\n"
- "fmla v25.8h, v5.8h, v23.8h\n"
- "ldr q17, [x10, x11]\n"
+ "fmla v30.8h, v6.8h, v17.8h\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q18, [x16, x7]\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x12]\n"
+ "fmla v26.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "fmla v30.8h, v0.8h, v18.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ld1 { v20.8h }, [x10]\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v26.8h, v5.8h, v22.8h\n"
+ "fmla v21.8h, v2.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v20.8h\n"
+ "fmla v30.8h, v2.8h, v16.8h\n"
+ "ld1 { v17.8h }, [x28]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q16, [x10, x11]\n"
+ "fmla v28.8h, v7.8h, v22.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v6.8h, v17.8h\n"
+ "ldr q19, [x10, x7]\n"
+ "fmla v30.8h, v8.8h, v22.8h\n"
+ "ldr q18, [x28, x13]\n"
+ "fmla v29.8h, v3.8h, v20.8h\n"
+ "ldr q17, [x28, x11]\n"
+ "fmla v28.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x26, x7]\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v27.8h, v8.8h, v18.8h\n"
+ "fmla v30.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v5.8h, v17.8h\n"
"fmla v26.8h, v0.8h, v19.8h\n"
- "fmla v22.8h, v3.8h, v18.8h\n"
- "fmla v24.8h, v2.8h, v16.8h\n"
- "fmla v20.8h, v4.8h, v17.8h\n"
- "fmla v21.8h, v3.8h, v17.8h\n"
- "fmla v28.8h, v3.8h, v19.8h\n"
- "ldr q19, [x10, x26]\n"
- "fmla v27.8h, v5.8h, v16.8h\n"
- "ldr q16, [x27, x8]\n"
- "fmla v26.8h, v6.8h, v18.8h\n"
- "ldr q18, [x13, x8]\n"
- "fmla v25.8h, v7.8h, v17.8h\n"
- "fmla v22.8h, v5.8h, v17.8h\n"
- "fmla v24.8h, v6.8h, v17.8h\n"
- "fmla v21.8h, v5.8h, v19.8h\n"
- "fmla v20.8h, v6.8h, v16.8h\n"
- "fmla v26.8h, v8.8h, v17.8h\n"
- "fmla v22.8h, v7.8h, v16.8h\n"
- "ldr q17, [x27, x9]\n"
- "fmla v29.8h, v3.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v18.8h\n"
- "fmla v24.8h, v8.8h, v19.8h\n"
- "ldr q16, [x13, x9]\n"
- "fmla v20.8h, v8.8h, v17.8h\n"
- "add x13, x13, #0x10\n"
- "fmla v21.8h, v7.8h, v17.8h\n"
- "ldr q19, [x10, x9]\n"
- "fmla v28.8h, v4.8h, v18.8h\n"
- "fmla v26.8h, v1.8h, v18.8h\n"
- "ldr q17, [x10, x8]\n"
- "fmla v29.8h, v5.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "ldr q18, [x26, x12]\n"
+ "fmla v25.8h, v8.8h, v17.8h\n"
+ "ldr q16, [x10, x12]\n"
+ "fmla v27.8h, v1.8h, v19.8h\n"
+ "ldr q17, [x28, x7]\n"
"add x10, x10, #0x10\n"
- "fmla v27.8h, v4.8h, v16.8h\n"
- "fmla v25.8h, v2.8h, v16.8h\n"
- "fmla v24.8h, v1.8h, v16.8h\n"
- "ldr q16, [x16, x11]\n"
- "fmla v22.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v18.8h\n"
+ "fmla v23.8h, v7.8h, v18.8h\n"
+ "ldr q19, [x28, x12]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v16.8h\n"
+ "fmla v26.8h, v2.8h, v16.8h\n"
+ "fmla v25.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x13]\n"
+ "fmla v24.8h, v4.8h, v17.8h\n"
"add x16, x16, #0x10\n"
"ld1 { v10.8h }, [x16]\n"
- "fmla v20.8h, v3.8h, v17.8h\n"
- "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v27.8h, v7.8h, v17.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
"ldr q4, [x14, #0x50]\n"
- "fmla v26.8h, v7.8h, v17.8h\n"
- "fmla v25.8h, v6.8h, v17.8h\n"
- "ld1 { v18.8h }, [x12]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x9]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
"ldr q1, [x14, #0x20]\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmla v27.8h, v0.8h, v16.8h\n"
- "ldr q17, [x12, x26]\n"
- "fmla v24.8h, v7.8h, v19.8h\n"
- "add x12, x12, #0x10\n"
- "ldr q9, [x12, x11]\n"
- "fmla v20.8h, v5.8h, v19.8h\n"
- "fmla v22.8h, v0.8h, v18.8h\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x9, x11]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "add x9, x9, #0x10\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v0.8h, v18.8h\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v21.8h, v2.8h, v17.8h\n"
- "ldr q2, [x14, #0x30]\n"
- "fmla v25.8h, v8.8h, v19.8h\n"
- "ldr q16, [x27, x11]\n"
- "fmla v28.8h, v6.8h, v18.8h\n"
- "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x26, x13]\n"
+ "fmla v27.8h, v3.8h, v18.8h\n"
"ldr q3, [x14, #0x40]\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmla v27.8h, v8.8h, v17.8h\n"
- "fmla v24.8h, v5.8h, v17.8h\n"
- "ldr q11, [x16, x26]\n"
+ "fmla v23.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmla v28.8h, v8.8h, v17.8h\n"
+ "fmla v25.8h, v5.8h, v17.8h\n"
+ "ldr q11, [x16, x11]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v16.8h\n"
"ldr q8, [x14, #0x90]\n"
- "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
"ldr q7, [x14, #0x80]\n"
- "fmla v21.8h, v6.8h, v16.8h\n"
- "ldr q13, [x13, x11]\n"
- "ldr q6, [x14, #0x70]\n"
"fmax v27.8h, v27.8h, v15.8h\n"
"fmax v26.8h, v26.8h, v15.8h\n"
+ "add x26, x26, #0x10\n"
+ "ld1 { v12.8h }, [x26]\n"
+ "fmla v23.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x10, x13]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
"fmax v25.8h, v25.8h, v15.8h\n"
- "add x27, x27, #0x10\n"
- "ld1 { v12.8h }, [x27]\n"
- "fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
"add x14, x14, #0xa0\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
"fmax v21.8h, v21.8h, v15.8h\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "st1 { v28.8h }, [x15]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
"fmin v27.8h, v27.8h, v14.8h\n"
"fmin v26.8h, v26.8h, v14.8h\n"
- "str q29, [x15, x17]\n"
"fmin v25.8h, v25.8h, v14.8h\n"
"fmin v24.8h, v24.8h, v14.8h\n"
- "str q27, [x15, x22]\n"
- "add x15, x15, #0x10\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "st1 { v26.8h }, [x28]\n"
+ "st1 { v29.8h }, [x15]\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q25, [x28, x17]\n"
- "str q24, [x28, x22]\n"
- "add x28, x28, #0x10\n"
- "st1 { v22.8h }, [x25]\n"
- "str q20, [x25, x17]\n"
- "str q21, [x25, x22]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q30, [x15, x8]\n"
+ "str q28, [x15, x20]\n"
+ "add x15, x15, #0x10\n"
+ "st1 { v27.8h }, [x27]\n"
+ "str q26, [x27, x8]\n"
+ "str q25, [x27, x20]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v24.8h }, [x25]\n"
+ "str q21, [x25, x8]\n"
+ "str q23, [x25, x20]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
- "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
- "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr q23, [x12, x9]\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "ldr q18, [x12, x8]\n"
- "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v6.8h, v18.8h\n"
- "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "fmla v28.8h, v5.8h, v13.8h\n"
- "fmla v27.8h, v3.8h, v13.8h\n"
- "fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v25.8h, v1.8h, v13.8h\n"
- "fmla v24.8h, v0.8h, v13.8h\n"
- "ldr q17, [x16, x8]\n"
- "fmla v22.8h, v6.8h, v12.8h\n"
- "ldr q16, [x27, x26]\n"
- "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
- "fmla v29.8h, v0.8h, v17.8h\n"
- "fmla v21.8h, v8.8h, v16.8h\n"
- "ldr q16, [x16, x9]\n"
- "fmla v28.8h, v7.8h, v18.8h\n"
- "fmla v20.8h, v0.8h, v18.8h\n"
- "fmla v26.8h, v4.8h, v18.8h\n"
- "fmla v25.8h, v3.8h, v18.8h\n"
- "fmla v22.8h, v1.8h, v18.8h\n"
- "ld1 { v19.8h }, [x13]\n"
- "fmla v29.8h, v2.8h, v16.8h\n"
- "fmla v27.8h, v1.8h, v16.8h\n"
- "ld1 { v18.8h }, [x10]\n"
- "fmla v24.8h, v4.8h, v23.8h\n"
- "fmla v28.8h, v1.8h, v17.8h\n"
- "ldr q16, [x13, x26]\n"
- "fmla v20.8h, v2.8h, v23.8h\n"
- "fmla v21.8h, v1.8h, v23.8h\n"
- "fmla v29.8h, v8.8h, v23.8h\n"
- "fmla v27.8h, v7.8h, v23.8h\n"
- "fmla v25.8h, v5.8h, v23.8h\n"
- "ldr q17, [x10, x11]\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x9, x12]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x9, x7]\n"
+ "fmla v27.8h, v2.8h, v13.8h\n"
+ "fmla v26.8h, v1.8h, v13.8h\n"
+ "fmla v25.8h, v0.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x26, x11]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v6.8h, v17.8h\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q18, [x16, x7]\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x12]\n"
+ "fmla v26.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "fmla v30.8h, v0.8h, v18.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ld1 { v20.8h }, [x10]\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v26.8h, v5.8h, v22.8h\n"
+ "fmla v21.8h, v2.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v20.8h\n"
+ "fmla v30.8h, v2.8h, v16.8h\n"
+ "ld1 { v17.8h }, [x28]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q16, [x10, x11]\n"
+ "fmla v28.8h, v7.8h, v22.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v6.8h, v17.8h\n"
+ "ldr q19, [x10, x7]\n"
+ "fmla v30.8h, v8.8h, v22.8h\n"
+ "ldr q18, [x28, x13]\n"
+ "fmla v29.8h, v3.8h, v20.8h\n"
+ "ldr q17, [x28, x11]\n"
+ "fmla v28.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x26, x7]\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v27.8h, v8.8h, v18.8h\n"
+ "fmla v30.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v5.8h, v17.8h\n"
"fmla v26.8h, v0.8h, v19.8h\n"
- "fmla v22.8h, v3.8h, v18.8h\n"
- "fmla v24.8h, v2.8h, v16.8h\n"
- "fmla v20.8h, v4.8h, v17.8h\n"
- "fmla v21.8h, v3.8h, v17.8h\n"
- "fmla v28.8h, v3.8h, v19.8h\n"
- "ldr q19, [x10, x26]\n"
- "fmla v27.8h, v5.8h, v16.8h\n"
- "ldr q16, [x27, x8]\n"
- "fmla v26.8h, v6.8h, v18.8h\n"
- "ldr q18, [x13, x8]\n"
- "fmla v25.8h, v7.8h, v17.8h\n"
- "fmla v22.8h, v5.8h, v17.8h\n"
- "fmla v24.8h, v6.8h, v17.8h\n"
- "fmla v21.8h, v5.8h, v19.8h\n"
- "fmla v20.8h, v6.8h, v16.8h\n"
- "fmla v26.8h, v8.8h, v17.8h\n"
- "fmla v22.8h, v7.8h, v16.8h\n"
- "ldr q17, [x27, x9]\n"
- "fmla v29.8h, v3.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v18.8h\n"
- "fmla v24.8h, v8.8h, v19.8h\n"
- "ldr q16, [x13, x9]\n"
- "fmla v20.8h, v8.8h, v17.8h\n"
- "add x13, x13, #0x10\n"
- "fmla v21.8h, v7.8h, v17.8h\n"
- "ldr q19, [x10, x9]\n"
- "fmla v28.8h, v4.8h, v18.8h\n"
- "fmla v26.8h, v1.8h, v18.8h\n"
- "ldr q17, [x10, x8]\n"
- "fmla v29.8h, v5.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "ldr q18, [x26, x12]\n"
+ "fmla v25.8h, v8.8h, v17.8h\n"
+ "ldr q16, [x10, x12]\n"
+ "fmla v27.8h, v1.8h, v19.8h\n"
+ "ldr q17, [x28, x7]\n"
"add x10, x10, #0x10\n"
- "fmla v27.8h, v4.8h, v16.8h\n"
- "fmla v25.8h, v2.8h, v16.8h\n"
- "fmla v24.8h, v1.8h, v16.8h\n"
- "ldr q16, [x16, x11]\n"
- "fmla v22.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v18.8h\n"
+ "fmla v23.8h, v7.8h, v18.8h\n"
+ "ldr q19, [x28, x12]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v16.8h\n"
+ "fmla v26.8h, v2.8h, v16.8h\n"
+ "fmla v25.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x13]\n"
+ "fmla v24.8h, v4.8h, v17.8h\n"
"add x16, x16, #0x10\n"
- "fmla v20.8h, v3.8h, v17.8h\n"
- "fmla v21.8h, v4.8h, v19.8h\n"
- "fmla v26.8h, v7.8h, v17.8h\n"
- "fmla v25.8h, v6.8h, v17.8h\n"
- "ld1 { v18.8h }, [x12]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v1.8h, v16.8h\n"
- "fmax v29.8h, v29.8h, v15.8h\n"
- "fmla v27.8h, v0.8h, v16.8h\n"
- "ldr q17, [x12, x26]\n"
- "fmla v24.8h, v7.8h, v19.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "fmla v20.8h, v5.8h, v19.8h\n"
- "fmla v22.8h, v0.8h, v18.8h\n"
- "add x12, x12, #0x10\n"
- "fmla v21.8h, v2.8h, v17.8h\n"
- "fmla v25.8h, v8.8h, v19.8h\n"
- "ldr q16, [x27, x11]\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "fmla v28.8h, v6.8h, v18.8h\n"
- "fmla v26.8h, v3.8h, v18.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "add x27, x27, #0x10\n"
- "fmla v27.8h, v8.8h, v17.8h\n"
- "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v27.8h, v7.8h, v17.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "fmla v26.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x9]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x9, x11]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x26, x13]\n"
+ "fmla v27.8h, v3.8h, v18.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v23.8h, v2.8h, v17.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmla v28.8h, v8.8h, v17.8h\n"
+ "fmla v25.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
"fmax v27.8h, v27.8h, v15.8h\n"
- "fmla v22.8h, v8.8h, v16.8h\n"
- "fmla v20.8h, v7.8h, v16.8h\n"
"fmax v26.8h, v26.8h, v15.8h\n"
- "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmla v23.8h, v6.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
"fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "fmax v20.8h, v20.8h, v15.8h\n"
"fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
"fmin v28.8h, v28.8h, v14.8h\n"
- "st1 { v28.8h }, [x15]\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
"fmin v26.8h, v26.8h, v14.8h\n"
- "str q29, [x15, x17]\n"
+ "st1 { v27.8h }, [x27]\n"
"fmin v25.8h, v25.8h, v14.8h\n"
"fmin v24.8h, v24.8h, v14.8h\n"
- "str q27, [x15, x22]\n"
- "add x15, x15, #0x10\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "st1 { v26.8h }, [x28]\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "str q25, [x28, x17]\n"
- "str q24, [x28, x22]\n"
- "add x28, x28, #0x10\n"
- "st1 { v22.8h }, [x25]\n"
- "str q20, [x25, x17]\n"
- "str q21, [x25, x22]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "st1 { v29.8h }, [x15]\n"
+ "str q30, [x15, x8]\n"
+ "str q28, [x15, x20]\n"
+ "add x15, x15, #0x10\n"
+ "str q26, [x27, x8]\n"
+ "str q25, [x27, x20]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v24.8h }, [x25]\n"
+ "str q21, [x25, x8]\n"
+ "str q23, [x25, x20]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 93f\n"
"ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "add x24, x12, x11\n"
+ "add x24, x9, x13\n"
"add x23, x16, XZR\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x22, x16, x26\n"
- "add x21, x27, XZR\n"
+ "add x22, x16, x11\n"
+ "add x21, x26, XZR\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x20, x13, x11\n"
+ "add x20, x10, x13\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
@@ -511,23 +511,23 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
"mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
"mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
- "add x20, x27, x26\n"
+ "add x20, x26, x11\n"
"mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
"mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
"mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
"mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
"mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"fmla v23.8h, v0.8h, v10.8h\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
"fmla v26.8h, v2.8h, v13.8h\n"
"fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
"fmla v28.8h, v0.8h, v13.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
"tbz %x[n_channels], #2, 10f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -549,7 +549,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr h12, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v31.8h, v8.8h, v12.8h\n"
- "add x20, x12, x8\n"
+ "add x20, x9, x7\n"
"tbz %x[n_channels], #2, 14f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
@@ -572,7 +572,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"16:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "add x20, x16, x8\n"
+ "add x20, x16, x7\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
@@ -599,7 +599,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
"fmla v23.8h, v1.8h, v13.8h\n"
"fmla v24.8h, v0.8h, v13.8h\n"
- "add x20, x16, x9\n"
+ "add x20, x16, x12\n"
"tbz %x[n_channels], #2, 22f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
@@ -622,7 +622,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
"fmla v24.8h, v2.8h, v12.8h\n"
"fmla v25.8h, v1.8h, v12.8h\n"
- "add x20, x12, x9\n"
+ "add x20, x9, x12\n"
"tbz %x[n_channels], #2, 26f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
@@ -645,7 +645,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
- "add x20, x13, XZR\n"
+ "add x20, x10, XZR\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
@@ -672,7 +672,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
"fmla v23.8h, v3.8h, v11.8h\n"
"fmla v26.8h, v0.8h, v11.8h\n"
- "add x20, x13, x26\n"
+ "add x20, x10, x11\n"
"tbz %x[n_channels], #2, 34f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
@@ -695,7 +695,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"36:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v25.8h, v5.8h, v13.8h\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x20, x10, XZR\n"
+ "add x20, x28, XZR\n"
"tbz %x[n_channels], #2, 38f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
@@ -718,7 +718,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"40:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v26.8h, v6.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
- "add x20, x10, x11\n"
+ "add x20, x28, x13\n"
"tbz %x[n_channels], #2, 42f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
@@ -741,7 +741,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"44:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "add x20, x10, x26\n"
+ "add x20, x28, x11\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
@@ -768,7 +768,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
"fmla v28.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x20, x27, x8\n"
+ "add x20, x26, x7\n"
"tbz %x[n_channels], #2, 50f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
@@ -791,7 +791,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"52:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
"fmla v29.8h, v7.8h, v13.8h\n"
"fmla v30.8h, v6.8h, v13.8h\n"
- "add x20, x13, x8\n"
+ "add x20, x10, x7\n"
"tbz %x[n_channels], #2, 54f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
@@ -814,7 +814,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"56:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
- "add x20, x13, x9\n"
+ "add x20, x10, x12\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 58f\n"
@@ -839,7 +839,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"60:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v24.8h, v5.8h, v11.8h\n"
"fmla v25.8h, v4.8h, v11.8h\n"
- "add x20, x27, x9\n"
+ "add x20, x26, x12\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 62f\n"
@@ -864,7 +864,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"64:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
"fmla v30.8h, v8.8h, v13.8h\n"
"fmla v31.8h, v7.8h, v13.8h\n"
- "add x20, x10, x8\n"
+ "add x20, x28, x7\n"
"tbz %x[n_channels], #2, 66f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
@@ -887,7 +887,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"68:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "add x20, x16, x11\n"
+ "add x20, x16, x13\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 70f\n"
@@ -912,7 +912,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"72:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "add x20, x10, x9\n"
+ "add x20, x28, x12\n"
"fmla v25.8h, v0.8h, v11.8h\n"
"tbz %x[n_channels], #2, 74f\n"
"ldr d13, [x20], #0x8\n"
@@ -936,7 +936,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"76:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v27.8h, v8.8h, v13.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x20, x12, XZR\n"
+ "add x20, x9, XZR\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
"tbz %x[n_channels], #2, 78f\n"
@@ -961,7 +961,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"80:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "add x20, x12, x26\n"
+ "add x20, x9, x11\n"
"fmla v29.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 82f\n"
"ldr d11, [x20], #0x8\n"
@@ -985,7 +985,7 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"84:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "add x20, x27, x11\n"
+ "add x20, x26, x13\n"
"fmla v31.8h, v2.8h, v11.8h\n"
"tbz %x[n_channels], #2, 86f\n"
"ldr d13, [x20], #0x8\n"
@@ -1030,46 +1030,46 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 90f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.d }[0], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.d }[0], [x21], x17\n"
"add x15, x15, #0x8\n"
- "st1 { v29.d }[0], [x20], x17\n"
- "add x28, x28, #0x8\n"
+ "add x27, x27, #0x8\n"
"add x25, x25, #0x8\n"
- "st1 { v24.d }[0], [x22], x17\n"
- "st1 { v27.d }[0], [x21], x17\n"
- "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v23.d }[0], [x22], x8\n"
+ "st1 { v26.d }[0], [x21], x8\n"
+ "st1 { v29.d }[0], [x20], x8\n"
+ "st1 { v24.d }[0], [x22], x8\n"
+ "st1 { v27.d }[0], [x21], x8\n"
+ "st1 { v30.d }[0], [x20], x8\n"
"st1 { v25.d }[0], [x22]\n"
"st1 { v28.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 89f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.s }[2], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.s }[2], [x21], x17\n"
"add x15, x15, #0x4\n"
- "st1 { v29.s }[2], [x20], x17\n"
- "add x28, x28, #0x4\n"
+ "add x27, x27, #0x4\n"
"add x25, x25, #0x4\n"
- "st1 { v24.s }[2], [x22], x17\n"
- "st1 { v27.s }[2], [x21], x17\n"
- "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v23.s }[2], [x22], x8\n"
+ "st1 { v26.s }[2], [x21], x8\n"
+ "st1 { v29.s }[2], [x20], x8\n"
+ "st1 { v24.s }[2], [x22], x8\n"
+ "st1 { v27.s }[2], [x21], x8\n"
+ "st1 { v30.s }[2], [x20], x8\n"
"st1 { v25.s }[2], [x22]\n"
"st1 { v28.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 92f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.h }[6], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.h }[6], [x21], x17\n"
- "st1 { v29.h }[6], [x20], x17\n"
- "st1 { v24.h }[6], [x22], x17\n"
- "st1 { v27.h }[6], [x21], x17\n"
- "st1 { v30.h }[6], [x20], x17\n"
+ "st1 { v23.h }[6], [x22], x8\n"
+ "st1 { v24.h }[6], [x22], x8\n"
+ "st1 { v26.h }[6], [x21], x8\n"
+ "st1 { v29.h }[6], [x20], x8\n"
+ "st1 { v27.h }[6], [x21], x8\n"
+ "st1 { v30.h }[6], [x20], x8\n"
"st1 { v25.h }[6], [x22]\n"
"st1 { v28.h }[6], [x21]\n"
"st1 { v31.h }[6], [x20]\n"
@@ -1077,14 +1077,14 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"89:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 92f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.h }[4], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.h }[4], [x21], x17\n"
- "st1 { v29.h }[4], [x20], x17\n"
- "st1 { v24.h }[4], [x22], x17\n"
- "st1 { v27.h }[4], [x21], x17\n"
- "st1 { v30.h }[4], [x20], x17\n"
+ "st1 { v23.h }[4], [x22], x8\n"
+ "st1 { v24.h }[4], [x22], x8\n"
+ "st1 { v26.h }[4], [x21], x8\n"
+ "st1 { v29.h }[4], [x20], x8\n"
+ "st1 { v27.h }[4], [x21], x8\n"
+ "st1 { v30.h }[4], [x20], x8\n"
"st1 { v25.h }[4], [x22]\n"
"st1 { v28.h }[4], [x21]\n"
"st1 { v31.h }[4], [x20]\n"
@@ -1092,63 +1092,63 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"90:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 91f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.s }[0], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.s }[0], [x21], x17\n"
"add x15, x15, #0x4\n"
- "st1 { v29.s }[0], [x20], x17\n"
- "add x28, x28, #0x4\n"
+ "add x27, x27, #0x4\n"
"add x25, x25, #0x4\n"
- "st1 { v24.s }[0], [x22], x17\n"
- "st1 { v27.s }[0], [x21], x17\n"
- "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v23.s }[0], [x22], x8\n"
+ "st1 { v26.s }[0], [x21], x8\n"
+ "st1 { v29.s }[0], [x20], x8\n"
+ "st1 { v24.s }[0], [x22], x8\n"
+ "st1 { v27.s }[0], [x21], x8\n"
+ "st1 { v30.s }[0], [x20], x8\n"
"st1 { v25.s }[0], [x22]\n"
"st1 { v28.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 92f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.h }[2], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.h }[2], [x21], x17\n"
- "st1 { v29.h }[2], [x20], x17\n"
- "st1 { v24.h }[2], [x22], x17\n"
- "st1 { v27.h }[2], [x21], x17\n"
- "st1 { v30.h }[2], [x20], x17\n"
+ "st1 { v23.h }[2], [x22], x8\n"
+ "st1 { v24.h }[2], [x22], x8\n"
+ "st1 { v26.h }[2], [x21], x8\n"
+ "st1 { v29.h }[2], [x20], x8\n"
+ "st1 { v27.h }[2], [x21], x8\n"
+ "st1 { v30.h }[2], [x20], x8\n"
"st1 { v25.h }[2], [x22]\n"
"st1 { v28.h }[2], [x21]\n"
"st1 { v31.h }[2], [x20]\n"
"b 92f\n"
"91:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.h }[0], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.h }[0], [x21], x17\n"
- "st1 { v29.h }[0], [x20], x17\n"
- "st1 { v24.h }[0], [x22], x17\n"
- "st1 { v27.h }[0], [x21], x17\n"
- "st1 { v30.h }[0], [x20], x17\n"
+ "st1 { v23.h }[0], [x22], x8\n"
+ "st1 { v24.h }[0], [x22], x8\n"
+ "st1 { v26.h }[0], [x21], x8\n"
+ "st1 { v29.h }[0], [x20], x8\n"
+ "st1 { v27.h }[0], [x21], x8\n"
+ "st1 { v30.h }[0], [x20], x8\n"
"st1 { v25.h }[0], [x22]\n"
"st1 { v28.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"92:" // Tile loop: Oddments: Store: Bit 2: End
"93:" // Tile loop: End
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x23, x23, #0x1\n"
- "add x21, x24, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x23, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x24, x24, x21, LT\n"
- "csel x23, x23, XZR, LT\n"
- "cmp x24, x20\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x9, x9, #0x1\n"
+ "add x20, x10, #0x1\n"
+ "cmp x9, x22\n"
+ "csel x10, x10, x20, LT\n"
+ "csel x9, x9, XZR, LT\n"
+ "cmp x10, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 72e68482c6..c4b0c721cc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,9 +91,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"lsr x8, %x[n_channels], #0x3\n"
"ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.8h }, [x20]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x21]\n"
"ld1r { v14.8h }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"mov x14, #0x0\n"
@@ -111,357 +111,357 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr q7, [x16, #0x80]\n"
"ldr q8, [x16, #0x90]\n"
"add x16, x16, #0xa0\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "ldr q9, [x21, x14]\n"
- "ldr q10, [x20, x14]\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q11, [x21, x14]\n"
- "ldr q12, [x20, x14]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
"ldr x20, [x15, #0x20]\n"
+ "ldr q9, [x24, x14]\n"
+ "ldr q10, [x23, x14]\n"
+ "ldr q11, [x22, x14]\n"
+ "ldr q12, [x21, x14]\n"
"ldr q13, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
- "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
- "ldr x26, [x15, #0x30]\n"
- "ldr x23, [x15, #0x38]\n"
- "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v0.8h, v10.8h\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr q19, [x20, x14]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
- "ldr x21, [x15, #0x40]\n"
- "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
- "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "ldr x26, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x15, #0x40]\n"
"ldr x25, [x15, #0x50]\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v0.8h, v9.8h\n"
"ldr x24, [x15, #0x58]\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "ldr q17, [x26, x14]\n"
- "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
- "ldr x20, [x15, #0x60]\n"
- "fmla v29.8h, v5.8h, v13.8h\n"
- "fmla v28.8h, v6.8h, v17.8h\n"
- "ldr x12, [x15, #0x70]\n"
- "ldr x11, [x15, #0x88]\n"
- "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
- "fmla v27.8h, v3.8h, v13.8h\n"
- "ldr x10, [x17, #0x0]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x21, x14]\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr x12, [x15, #0x88]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x22, x14]\n"
+ "fmla v27.8h, v2.8h, v13.8h\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v26.8h, v1.8h, v13.8h\n"
+ "fmla v25.8h, v0.8h, v13.8h\n"
+ "ldr x11, [x17, #0x0]\n"
"add x13, x13, #0x10\n"
- "fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v25.8h, v1.8h, v13.8h\n"
- "ldr x9, [x17, #0x8]\n"
- "ldr x28, [x17, #0x10]\n"
- "fmla v24.8h, v0.8h, v13.8h\n"
- "ldr q18, [x23, x14]\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "ldr q16, [x22, x14]\n"
+ "fmla v24.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x26, x14]\n"
"mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
"ldr q31, [x16, #0x0]\n"
- "fmla v29.8h, v7.8h, v17.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "fmla v28.8h, v0.8h, v18.8h\n"
- "fmla v22.8h, v8.8h, v16.8h\n"
- "ldr q16, [x21, x14]\n"
- "ldr x22, [x15, #0x78]\n"
- "fmla v26.8h, v4.8h, v17.8h\n"
- "fmla v25.8h, v3.8h, v17.8h\n"
- "ldr x21, [x15, #0x80]\n"
- "ldr x27, [x17, #0x18]\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v6.8h, v17.8h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q18, [x27, x14]\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "ldr x9, [x15, #0x78]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v26.8h, v3.8h, v17.8h\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v21.8h, v0.8h, v17.8h\n"
- "fmla v24.8h, v4.8h, v19.8h\n"
- "fmla v23.8h, v1.8h, v17.8h\n"
+ "fmla v25.8h, v4.8h, v22.8h\n"
+ "ldr x28, [x17, #0x10]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v30.8h, v7.8h, v17.8h\n"
+ "fmla v29.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
"ldr q17, [x25, x14]\n"
- "fmla v29.8h, v1.8h, v18.8h\n"
- "ldr q20, [x24, x14]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v27.8h, v1.8h, v16.8h\n"
- "ldr q16, [x20, x14]\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
"ldr x26, [x15, #0x90]\n"
- "fmla v25.8h, v5.8h, v19.8h\n"
- "fmla v21.8h, v2.8h, v19.8h\n"
- "ldr x25, [x15, #0xa0]\n"
- "ldr x20, [x15, #0x98]\n"
- "fmla v26.8h, v0.8h, v17.8h\n"
- "fmla v24.8h, v2.8h, v20.8h\n"
- "fmla v28.8h, v8.8h, v19.8h\n"
- "fmla v27.8h, v7.8h, v19.8h\n"
- "fmla v22.8h, v1.8h, v19.8h\n"
- "ldr q19, [x23, x14]\n"
- "fmla v23.8h, v3.8h, v16.8h\n"
- "ldr x24, [x15, #0xa8]\n"
- "fmla v26.8h, v6.8h, v16.8h\n"
- "ldr q18, [x21, x14]\n"
- "fmla v25.8h, v7.8h, v19.8h\n"
- "ldr x23, [x15, #0xc0]\n"
- "fmla v24.8h, v6.8h, v19.8h\n"
- "fmla v21.8h, v4.8h, v19.8h\n"
- "fmla v29.8h, v3.8h, v17.8h\n"
- "ldr q17, [x12, x14]\n"
- "fmla v27.8h, v5.8h, v20.8h\n"
- "ldr q16, [x22, x14]\n"
- "fmla v23.8h, v5.8h, v19.8h\n"
- "fmla v22.8h, v3.8h, v19.8h\n"
- "ldr x22, [x15, #0xb0]\n"
- "ldr x21, [x15, #0xb8]\n"
- "fmla v26.8h, v8.8h, v19.8h\n"
- "fmla v24.8h, v8.8h, v17.8h\n"
- "fmla v21.8h, v6.8h, v16.8h\n"
- "fmla v28.8h, v3.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v18.8h\n"
- "fmla v22.8h, v5.8h, v17.8h\n"
- "ldr q17, [x11, x14]\n"
- "fmla v23.8h, v7.8h, v16.8h\n"
- "ldr q16, [x26, x14]\n"
- "fmla v29.8h, v4.8h, v18.8h\n"
- "fmla v26.8h, v1.8h, v18.8h\n"
+ "fmla v26.8h, v5.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v2.8h, v22.8h\n"
+ "fmla v30.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v28.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v24.8h, v3.8h, v16.8h\n"
+ "fmla v30.8h, v3.8h, v17.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "ldr x23, [x15, #0xb0]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
"ldr q18, [x20, x14]\n"
- "fmla v28.8h, v5.8h, v17.8h\n"
- "fmla v27.8h, v4.8h, v17.8h\n"
- "fmla v25.8h, v2.8h, v17.8h\n"
- "fmla v24.8h, v1.8h, v17.8h\n"
- "ldr q17, [x25, x14]\n"
- "fmla v21.8h, v8.8h, v16.8h\n"
- "ldr x20, [x15, #0x20]\n"
- "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x9, x14]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmla v23.8h, v3.8h, v17.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v30.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "fmla v26.8h, v0.8h, v18.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "ldr q16, [x12, x14]\n"
+ "fmla v27.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x25, x14]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v16.8h\n"
+ "fmla v26.8h, v2.8h, v16.8h\n"
+ "fmla v25.8h, v1.8h, v16.8h\n"
"ldr q16, [x24, x14]\n"
- "fmla v29.8h, v2.8h, v17.8h\n"
- "fmla v26.8h, v7.8h, v18.8h\n"
- "fmla v25.8h, v6.8h, v18.8h\n"
- "fmla v23.8h, v4.8h, v18.8h\n"
- "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr x24, [x15, #0x20]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v7.8h, v17.8h\n"
"ldr q18, [x22, x14]\n"
- "fmla v22.8h, v4.8h, v16.8h\n"
- "ldr q4, [x16, #0x50]\n"
- "fmla v28.8h, v1.8h, v17.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v27.8h, v0.8h, v17.8h\n"
- "ldr q17, [x21, x14]\n"
- "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v6.8h, v19.8h\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v21.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v25.8h, v7.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "ldr q4, [x16, #0x50]\n"
"fmax v29.8h, v29.8h, v15.8h\n"
- "fmla v24.8h, v7.8h, v16.8h\n"
- "fmla v21.8h, v5.8h, v16.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q29, [x10, x13]\n"
- "fmla v23.8h, v0.8h, v18.8h\n"
+ "fmla v30.8h, v6.8h, v16.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
"ldr q0, [x16, #0x10]\n"
- "fmla v22.8h, v2.8h, v17.8h\n"
- "ldr q2, [x16, #0x30]\n"
- "fmla v25.8h, v8.8h, v16.8h\n"
- "ldr q16, [x23, x14]\n"
- "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmla v26.8h, v8.8h, v18.8h\n"
+ "fmla v27.8h, v3.8h, v16.8h\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v27.8h, v8.8h, v17.8h\n"
- "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v25.8h, v5.8h, v17.8h\n"
"ldr q5, [x16, #0x60]\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmla v23.8h, v8.8h, v16.8h\n"
+ "fmla v23.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.8h, v8.8h, v16.8h\n"
"ldr q8, [x16, #0x90]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "ldr q9, [x23, x7]\n"
+ "ldr q10, [x22, x7]\n"
"fmla v21.8h, v7.8h, v16.8h\n"
"ldr q7, [x16, #0x80]\n"
- "fmla v22.8h, v6.8h, v16.8h\n"
- "ldr q13, [x20, x7]\n"
- "ldr q6, [x16, #0x70]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "ldr q11, [x21, x7]\n"
+ "ldr q12, [x20, x7]\n"
+ "fmla v23.8h, v6.8h, v16.8h\n"
"fmax v27.8h, v27.8h, v15.8h\n"
- "fmax v26.8h, v26.8h, v15.8h\n"
- "fmax v25.8h, v25.8h, v15.8h\n"
- "ldr x24, [x17, #0x20]\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "ldr q9, [x21, x7]\n"
- "ldr q10, [x20, x7]\n"
+ "ldr q13, [x24, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
"fmin v28.8h, v28.8h, v14.8h\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q11, [x21, x7]\n"
"fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "ldr q12, [x20, x7]\n"
+ "str q30, [x11, x13]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
"fmax v24.8h, v24.8h, v15.8h\n"
- "fmax v23.8h, v23.8h, v15.8h\n"
- "str q28, [x9, x13]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
"fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "str q27, [x28, x13]\n"
+ "str q29, [x10, x13]\n"
"ldr x23, [x17, #0x28]\n"
- "str q26, [x27, x13]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q28, [x28, x13]\n"
"ldr x22, [x17, #0x30]\n"
- "ldr x21, [x17, #0x38]\n"
"add x7, x7, #0x10\n"
- "str q25, [x24, x13]\n"
- "ldr x20, [x17, #0x40]\n"
+ "str q26, [x20, x13]\n"
+ "ldr x21, [x17, #0x40]\n"
"cmp x7, x8, LSL #4\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x27, x13]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "add x14, x14, #0x10\n"
- "str q24, [x23, x13]\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "str q23, [x22, x13]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
"add x16, x16, #0xa0\n"
- "str q21, [x21, x13]\n"
- "str q22, [x20, x13]\n"
+ "str q25, [x23, x13]\n"
+ "str q24, [x22, x13]\n"
+ "str q21, [x20, x13]\n"
+ "str q23, [x21, x13]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
- "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
- "ldr x23, [x15, #0x30]\n"
- "ldr x22, [x15, #0x38]\n"
- "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
- "fmla v29.8h, v0.8h, v10.8h\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr q19, [x20, x14]\n"
- "fmla v28.8h, v4.8h, v13.8h\n"
- "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "ldr x26, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v3.8h, v9.8h\n"
"ldr x20, [x15, #0x40]\n"
- "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
- "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
"ldr x25, [x15, #0x50]\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v0.8h, v9.8h\n"
"ldr x24, [x15, #0x58]\n"
- "fmla v27.8h, v2.8h, v11.8h\n"
- "ldr q17, [x23, x14]\n"
- "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
"ldr x23, [x15, #0x60]\n"
- "fmla v29.8h, v5.8h, v13.8h\n"
- "fmla v28.8h, v6.8h, v17.8h\n"
- "ldr x12, [x15, #0x70]\n"
- "ldr x11, [x15, #0x88]\n"
- "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
- "fmla v27.8h, v3.8h, v13.8h\n"
- "ldr x10, [x17, #0x0]\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x21, x14]\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr x12, [x15, #0x88]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x22, x14]\n"
+ "fmla v27.8h, v2.8h, v13.8h\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v26.8h, v1.8h, v13.8h\n"
+ "fmla v25.8h, v0.8h, v13.8h\n"
+ "ldr x11, [x17, #0x0]\n"
"add x13, x13, #0x10\n"
- "fmla v26.8h, v2.8h, v13.8h\n"
- "fmla v25.8h, v1.8h, v13.8h\n"
- "ldr x9, [x17, #0x8]\n"
- "ldr x28, [x17, #0x10]\n"
- "fmla v24.8h, v0.8h, v13.8h\n"
- "ldr q18, [x22, x14]\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "ldr q16, [x21, x14]\n"
+ "fmla v24.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x26, x14]\n"
"mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
- "fmla v29.8h, v7.8h, v17.8h\n"
- "ldr x22, [x15, #0x68]\n"
- "ldr x21, [x15, #0x78]\n"
- "fmla v28.8h, v0.8h, v18.8h\n"
- "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v6.8h, v17.8h\n"
+ "ldr x10, [x17, #0x8]\n"
+ "ldr x9, [x17, #0x10]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q18, [x27, x14]\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "ldr x28, [x15, #0x78]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
"ldr q16, [x20, x14]\n"
+ "fmla v26.8h, v3.8h, v17.8h\n"
"ldr x20, [x15, #0x80]\n"
- "fmla v26.8h, v4.8h, v17.8h\n"
- "fmla v25.8h, v3.8h, v17.8h\n"
- "ldr x27, [x17, #0x18]\n"
"fmla v21.8h, v0.8h, v17.8h\n"
- "fmla v24.8h, v4.8h, v19.8h\n"
- "fmla v23.8h, v1.8h, v17.8h\n"
+ "fmla v25.8h, v4.8h, v22.8h\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v30.8h, v7.8h, v17.8h\n"
+ "fmla v29.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
"ldr q17, [x25, x14]\n"
- "fmla v29.8h, v1.8h, v18.8h\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v26.8h, v5.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v2.8h, v22.8h\n"
+ "fmla v30.8h, v1.8h, v18.8h\n"
"ldr q20, [x24, x14]\n"
- "fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
"ldr q16, [x23, x14]\n"
- "ldr x26, [x15, #0x90]\n"
- "fmla v25.8h, v5.8h, v19.8h\n"
- "fmla v21.8h, v2.8h, v19.8h\n"
- "ldr x25, [x15, #0xa0]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla v26.8h, v0.8h, v17.8h\n"
- "fmla v24.8h, v2.8h, v20.8h\n"
- "fmla v28.8h, v8.8h, v19.8h\n"
- "fmla v27.8h, v7.8h, v19.8h\n"
- "fmla v22.8h, v1.8h, v19.8h\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v28.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v24.8h, v3.8h, v16.8h\n"
+ "fmla v30.8h, v3.8h, v17.8h\n"
"ldr q19, [x22, x14]\n"
- "fmla v23.8h, v3.8h, v16.8h\n"
- "ldr x23, [x15, #0xa8]\n"
- "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr x23, [x15, #0xb0]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
"ldr q18, [x20, x14]\n"
- "fmla v25.8h, v7.8h, v19.8h\n"
- "ldr x22, [x15, #0xc0]\n"
- "fmla v24.8h, v6.8h, v19.8h\n"
- "fmla v21.8h, v4.8h, v19.8h\n"
- "fmla v29.8h, v3.8h, v17.8h\n"
- "ldr q17, [x12, x14]\n"
- "fmla v27.8h, v5.8h, v20.8h\n"
- "ldr q16, [x21, x14]\n"
- "fmla v23.8h, v5.8h, v19.8h\n"
- "fmla v22.8h, v3.8h, v19.8h\n"
- "ldr x21, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x28, x14]\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla v26.8h, v8.8h, v19.8h\n"
- "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmla v23.8h, v3.8h, v17.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v30.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "fmla v26.8h, v0.8h, v18.8h\n"
"fmla v21.8h, v6.8h, v16.8h\n"
- "fmla v28.8h, v3.8h, v18.8h\n"
- "fmla v25.8h, v0.8h, v18.8h\n"
- "fmla v22.8h, v5.8h, v17.8h\n"
- "ldr q17, [x11, x14]\n"
- "fmla v23.8h, v7.8h, v16.8h\n"
- "ldr q16, [x26, x14]\n"
- "fmla v29.8h, v4.8h, v18.8h\n"
- "fmla v26.8h, v1.8h, v18.8h\n"
- "ldr q18, [x24, x14]\n"
- "fmla v28.8h, v5.8h, v17.8h\n"
- "fmla v27.8h, v4.8h, v17.8h\n"
- "fmla v25.8h, v2.8h, v17.8h\n"
- "fmla v24.8h, v1.8h, v17.8h\n"
- "ldr q17, [x25, x14]\n"
- "fmla v21.8h, v8.8h, v16.8h\n"
- "fmla v22.8h, v7.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "ldr q16, [x12, x14]\n"
+ "fmla v27.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x25, x14]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v16.8h\n"
+ "fmla v26.8h, v2.8h, v16.8h\n"
+ "fmla v25.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v7.8h, v17.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v19.8h\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v21.8h, v3.8h, v19.8h\n"
"ldr q16, [x23, x14]\n"
- "fmla v29.8h, v2.8h, v17.8h\n"
- "fmla v26.8h, v7.8h, v18.8h\n"
- "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v25.8h, v7.8h, v18.8h\n"
"fmla v23.8h, v4.8h, v18.8h\n"
- "fmla v21.8h, v3.8h, v18.8h\n"
- "ldr q18, [x21, x14]\n"
- "fmla v22.8h, v4.8h, v16.8h\n"
- "fmla v28.8h, v1.8h, v17.8h\n"
- "fmax v28.8h, v28.8h, v15.8h\n"
- "fmla v27.8h, v0.8h, v17.8h\n"
- "ldr q17, [x20, x14]\n"
- "fmla v29.8h, v6.8h, v18.8h\n"
"fmax v29.8h, v29.8h, v15.8h\n"
- "fmla v24.8h, v7.8h, v16.8h\n"
- "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmla v30.8h, v6.8h, v16.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v18.8h\n"
+ "fmla v27.8h, v3.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v25.8h, v5.8h, v17.8h\n"
"fmin v29.8h, v29.8h, v14.8h\n"
- "str q29, [x10, x13]\n"
- "fmla v23.8h, v0.8h, v18.8h\n"
- "fmla v22.8h, v2.8h, v17.8h\n"
- "ldr x20, [x17, #0x20]\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "fmla v25.8h, v8.8h, v16.8h\n"
- "ldr q16, [x22, x14]\n"
- "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmla v23.8h, v2.8h, v17.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "add x14, x14, #0x10\n"
"fmax v26.8h, v26.8h, v15.8h\n"
- "fmla v27.8h, v8.8h, v17.8h\n"
- "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v16.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
"fmax v27.8h, v27.8h, v15.8h\n"
- "str q28, [x9, x13]\n"
- "fmla v23.8h, v8.8h, v16.8h\n"
+ "str q29, [x10, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
"fmla v21.8h, v7.8h, v16.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmla v23.8h, v6.8h, v16.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
"fmax v25.8h, v25.8h, v15.8h\n"
- "ldr x23, [x17, #0x28]\n"
- "fmla v22.8h, v6.8h, v16.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
"fmin v27.8h, v27.8h, v14.8h\n"
- "str q27, [x28, x13]\n"
- "ldr x22, [x17, #0x30]\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "str q26, [x27, x13]\n"
- "ldr x21, [x17, #0x38]\n"
+ "str q30, [x11, x13]\n"
+ "ldr x20, [x17, #0x20]\n"
"fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
"fmax v23.8h, v23.8h, v15.8h\n"
- "str q25, [x20, x13]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q28, [x9, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "str q27, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "str q26, [x20, x13]\n"
"ldr x20, [x17, #0x40]\n"
- "fmax v21.8h, v21.8h, v15.8h\n"
- "fmax v22.8h, v22.8h, v15.8h\n"
- "add x14, x14, #0x10\n"
"fmin v24.8h, v24.8h, v14.8h\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "str q24, [x23, x13]\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "fmin v22.8h, v22.8h, v14.8h\n"
- "str q23, [x22, x13]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q25, [x23, x13]\n"
+ "str q24, [x22, x13]\n"
"str q21, [x21, x13]\n"
- "str q22, [x20, x13]\n"
+ "str q23, [x20, x13]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 92f\n"
@@ -478,13 +478,13 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr q8, [x16, #0x90]\n"
"ldr x24, [x15, #0x0]\n"
"ldr x23, [x15, #0x8]\n"
- "add x24, x24, x14\n"
- "add x23, x23, x14\n"
"ldr x22, [x15, #0x10]\n"
"ldr x21, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
"add x22, x22, x14\n"
"add x21, x21, x14\n"
- "ldr x20, [x15, #0x20]\n"
"add x20, x20, x14\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
@@ -537,23 +537,23 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
"mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
"ldr x20, [x15, #0x28]\n"
- "add x20, x20, x14\n"
"mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
"mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
"mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
"mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
"mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "add x20, x20, x14\n"
"fmla v23.8h, v0.8h, v10.8h\n"
"fmla v25.8h, v2.8h, v11.8h\n"
- "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
"fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "fmla v23.8h, v5.8h, v13.8h\n"
"fmla v24.8h, v4.8h, v13.8h\n"
- "fmla v25.8h, v3.8h, v13.8h\n"
"fmla v26.8h, v2.8h, v13.8h\n"
"fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
"fmla v28.8h, v0.8h, v13.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
"tbz %x[n_channels], #2, 9f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
@@ -600,11 +600,11 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x38]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
"fmla v24.8h, v6.8h, v11.8h\n"
- "add x20, x20, x14\n"
"fmla v26.8h, v4.8h, v11.8h\n"
"fmla v27.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v11.8h\n"
"fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 17f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
@@ -676,11 +676,11 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x50]\n"
"fmla v24.8h, v8.8h, v10.8h\n"
"fmla v25.8h, v7.8h, v10.8h\n"
- "add x20, x20, x14\n"
"fmla v27.8h, v5.8h, v10.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
"fmla v31.8h, v1.8h, v10.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 29f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
@@ -776,11 +776,11 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x70]\n"
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "add x20, x20, x14\n"
"fmla v28.8h, v6.8h, v10.8h\n"
"fmla v29.8h, v5.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v10.8h\n"
"fmla v31.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 45f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
@@ -852,9 +852,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x88]\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v24.8h, v3.8h, v12.8h\n"
- "add x20, x20, x14\n"
"fmla v26.8h, v1.8h, v12.8h\n"
"fmla v27.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 57f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
@@ -878,9 +878,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x90]\n"
"fmla v24.8h, v5.8h, v11.8h\n"
"fmla v25.8h, v4.8h, v11.8h\n"
- "add x20, x20, x14\n"
"fmla v27.8h, v2.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 61f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
@@ -928,9 +928,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xa0]\n"
"fmla v26.8h, v7.8h, v12.8h\n"
"fmla v27.8h, v6.8h, v12.8h\n"
- "add x20, x20, x14\n"
"fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 69f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -954,8 +954,8 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xa8]\n"
"fmla v23.8h, v2.8h, v11.8h\n"
"fmla v24.8h, v1.8h, v11.8h\n"
- "add x20, x20, x14\n"
"fmla v25.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 73f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
@@ -979,9 +979,9 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xb0]\n"
"fmla v27.8h, v8.8h, v13.8h\n"
"fmla v28.8h, v7.8h, v13.8h\n"
- "add x20, x20, x14\n"
"fmla v30.8h, v5.8h, v13.8h\n"
"fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 77f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
@@ -1005,8 +1005,8 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xb8]\n"
"fmla v23.8h, v6.8h, v12.8h\n"
"fmla v26.8h, v3.8h, v12.8h\n"
- "add x20, x20, x14\n"
"fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 81f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 80f\n"
@@ -1030,8 +1030,8 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xc0]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
- "add x20, x20, x14\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 84f\n"
@@ -1075,206 +1075,206 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"fmin v31.8h, v31.8h, v14.8h\n"
"tbz %x[n_channels], #2, 89f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.d }[0], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.d }[0], [x23]\n"
"st1 { v25.d }[0], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.d }[0], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.d }[0], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
- "add x13, x13, #0x8\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.d }[0], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.d }[0], [x22]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 88f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.s }[2], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.s }[2], [x23]\n"
"st1 { v25.s }[2], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.s }[2], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.s }[2], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
- "add x13, x13, #0x4\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.s }[2], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.s }[2], [x22]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 91f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.h }[6], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.h }[6], [x23]\n"
"st1 { v25.h }[6], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.h }[6], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.h }[6], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.h }[6], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.h }[6], [x22]\n"
+ "add x20, x20, x13\n"
"st1 { v30.h }[6], [x21]\n"
"st1 { v31.h }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 91f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.h }[4], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.h }[4], [x23]\n"
"st1 { v25.h }[4], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.h }[4], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.h }[4], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.h }[4], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.h }[4], [x22]\n"
+ "add x20, x20, x13\n"
"st1 { v30.h }[4], [x21]\n"
"st1 { v31.h }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 90f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.s }[0], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.s }[0], [x23]\n"
"st1 { v25.s }[0], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.s }[0], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.s }[0], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
- "add x13, x13, #0x4\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.s }[0], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.s }[0], [x22]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
"st1 { v30.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 91f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.h }[2], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.h }[2], [x23]\n"
"st1 { v25.h }[2], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.h }[2], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.h }[2], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.h }[2], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.h }[2], [x22]\n"
+ "add x20, x20, x13\n"
"st1 { v30.h }[2], [x21]\n"
"st1 { v31.h }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.h }[0], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.h }[0], [x23]\n"
"st1 { v25.h }[0], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.h }[0], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.h }[0], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.h }[0], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.h }[0], [x22]\n"
+ "add x20, x20, x13\n"
"st1 { v30.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"91:" // Oddments: Store: Bit 2: End
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index a1e1dd0e99..e88bdcc5be 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,56 +87,56 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x28, #0x0\n"
"mov x27, #0x0\n"
- "mov x26, #0x0\n"
"1:" // Tile loop
- "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x25, #0x4\n"
- "mov x23, #0x4\n"
- "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x21, #0x4\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x3, #0x10\n" // cntb _, ALL, #1
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
"ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
- "mov x6, #0x10\n" // cntb _, ALL, #1
- "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
- "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x4, x4, #0x1\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
- "lsl x5, x5, #0x1\n"
- "add x17, x4, x4\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x7, x7, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x15, x7, x24, LSL #1\n"
- "mul x20, x20, x23\n" // offset *= output_tile_size
- "add x14, x15, x24, LSL #1\n"
- "add x8, x8, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "lsr x13, %x[n_channels], #0x3\n"
- "add x12, x14, x24, LSL #1\n"
- "add x11, x17, x4\n"
- "add x10, x8, x22, LSL #1\n"
- "add x9, x12, x24, LSL #1\n"
- "add x28, x11, x4\n"
- "add x27, x10, x22, LSL #1\n"
- "add x23, x5, x5\n"
+ "lsr x6, %x[n_channels], #0x3\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v13.8h }, [x20]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x8, #0x0\n"
"ld1r { v15.8h }, [x20]\n"
- "add x26, x9, x24, LSL #1\n"
- "add x25, x28, x4\n"
- "add x24, x27, x22, LSL #1\n"
- "add x22, x23, x5\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x6\n"
- "cbz x13, 4f\n"
- "ldr q14, [x16, #0x0]\n"
+ "mul x24, x28, x25\n" // offset = tile_i * ld_input_row
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x23, XZR, x3\n"
+ "mul x22, x28, x2\n" // offset = tile_i * ld_output_row
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x24, x27, x4, x24\n" // offset += tile_j * ld_input_col
+ "lsl x4, x4, #0x1\n"
+ "madd x22, x27, x5, x22\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x1\n"
+ "mul x24, x24, x26\n" // offset *= kernel_stride * output_size
+ "add x15, x4, x4\n"
+ "add x14, x15, x4\n"
+ "add x13, x14, x4\n"
+ "mul x22, x22, x21\n" // offset *= output_tile_size
+ "add x21, x5, x5\n"
+ "add x12, x13, x4\n"
+ "add x20, x21, x5\n"
+ "add x7, x7, x24, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x11, x7, x25, LSL #1\n"
+ "add x10, x11, x25, LSL #1\n"
+ "add x17, x17, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x9, x10, x25, LSL #1\n"
+ "add x28, x17, x2, LSL #1\n"
+ "add x27, x9, x25, LSL #1\n"
+ "add x26, x28, x2, LSL #1\n"
+ "add x25, x27, x25, LSL #1\n"
+ "add x24, x26, x2, LSL #1\n"
+ "cbz x6, 4f\n"
+ "ldr q13, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "cmp x6, x13, LSL #4\n"
+ "cmp x3, x6, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
"ldr q3, [x16, #0x40]\n"
@@ -146,512 +146,512 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr q7, [x16, #0x80]\n"
"ldr q8, [x16, #0x90]\n"
"add x16, x16, #0xa0\n"
- "ldr q9, [x14, x17]\n"
+ "ldr q9, [x10, x15]\n"
"ld1 { v10.8h }, [x7]\n"
- "ldr q11, [x7, x25]\n"
- "ldr q12, [x14, x11]\n"
+ "ldr q11, [x7, x12]\n"
+ "ldr q12, [x10, x14]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v26.16b, v14.16b\n fmla v26.8h, v4.8h, v9.8h\n"
- "mov v28.16b, v14.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "add x6, x6, #0x10\n"
- "cmp x6, x13, LSL #4\n"
- "mov v16.16b, v14.16b\n fmla v16.8h, v3.8h, v9.8h\n"
- "mov v22.16b, v14.16b\n fmla v22.8h, v1.8h, v9.8h\n"
- "add x20, x20, #0x10\n"
- "add x21, x21, #0x10\n"
- "mov v23.16b, v14.16b\n fmla v23.8h, v0.8h, v9.8h\n"
- "fmla v26.8h, v5.8h, v12.8h\n"
- "mov v25.16b, v14.16b\n fmla v25.8h, v7.8h, v9.8h\n"
- "mov v17.16b, v14.16b\n fmla v17.8h, v6.8h, v9.8h\n"
- "mov v31.16b, v14.16b\n fmla v31.8h, v5.8h, v9.8h\n"
- "mov v20.16b, v14.16b\n fmla v20.8h, v2.8h, v9.8h\n"
- "ldr q9, [x12, x17]\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ld1 { v30.8h }, [x26]\n"
- "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q27, [x26, x25]\n"
- "fmla v16.8h, v4.8h, v12.8h\n"
- "fmla v22.8h, v2.8h, v12.8h\n"
- "fmla v23.8h, v1.8h, v12.8h\n"
- "mov v21.16b, v14.16b\n fmla v21.8h, v6.8h, v30.8h\n"
- "ldr q10, [x12, x11]\n"
- "fmla v26.8h, v7.8h, v9.8h\n"
- "fmla v25.8h, v8.8h, v12.8h\n"
- "fmla v17.8h, v7.8h, v12.8h\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "mov v24.16b, v14.16b\n fmla v24.8h, v3.8h, v12.8h\n"
- "mov v19.16b, v14.16b\n fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q11, [x7, x4]\n"
- "mov v30.16b, v14.16b\n fmla v30.8h, v8.8h, v27.8h\n"
- "ldr q12, [x7, x28]\n"
- "fmla v16.8h, v6.8h, v9.8h\n"
- "fmla v22.8h, v4.8h, v9.8h\n"
- "fmla v23.8h, v3.8h, v9.8h\n"
- "mov v27.16b, v14.16b\n fmla v27.8h, v1.8h, v9.8h\n"
- "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
- "ldr q14, [x16, #0x0]\n"
- "fmla v31.8h, v8.8h, v9.8h\n"
- "fmla v20.8h, v5.8h, v9.8h\n"
- "fmla v21.8h, v2.8h, v9.8h\n"
- "ld1 { v9.8h }, [x15]\n"
- "fmla v26.8h, v8.8h, v10.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
- "fmla v25.8h, v0.8h, v11.8h\n"
- "ldr q11, [x15, x25]\n"
- "fmla v17.8h, v2.8h, v12.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "ld1 { v12.8h }, [x9]\n"
- "fmla v16.8h, v7.8h, v10.8h\n"
- "fmla v24.8h, v6.8h, v10.8h\n"
- "fmla v22.8h, v5.8h, v10.8h\n"
- "fmla v23.8h, v4.8h, v10.8h\n"
- "fmla v19.8h, v3.8h, v10.8h\n"
- "fmla v27.8h, v2.8h, v10.8h\n"
- "fmla v18.8h, v1.8h, v10.8h\n"
- "fmla v30.8h, v0.8h, v10.8h\n"
- "ldr q10, [x15, x17]\n"
- "fmla v31.8h, v0.8h, v9.8h\n"
- "fmla v20.8h, v6.8h, v12.8h\n"
- "fmla v21.8h, v3.8h, v12.8h\n"
- "ldr q12, [x9, x25]\n"
- "fmla v26.8h, v1.8h, v10.8h\n"
- "fmla v28.8h, v3.8h, v9.8h\n"
- "fmla v29.8h, v5.8h, v11.8h\n"
- "fmla v24.8h, v2.8h, v11.8h\n"
- "ldr q11, [x15, x11]\n"
- "fmla v25.8h, v4.8h, v10.8h\n"
- "fmla v17.8h, v3.8h, v10.8h\n"
- "fmla v16.8h, v0.8h, v10.8h\n"
- "fmla v19.8h, v8.8h, v12.8h\n"
- "fmla v30.8h, v5.8h, v12.8h\n"
- "ldr q9, [x26, x4]\n"
- "fmla v31.8h, v2.8h, v10.8h\n"
- "fmla v26.8h, v2.8h, v11.8h\n"
- "fmla v28.8h, v5.8h, v10.8h\n"
- "ldr q10, [x14, x4]\n"
- "fmla v25.8h, v5.8h, v11.8h\n"
- "fmla v17.8h, v4.8h, v11.8h\n"
- "fmla v29.8h, v3.8h, v11.8h\n"
- "fmla v16.8h, v1.8h, v11.8h\n"
- "fmla v24.8h, v0.8h, v11.8h\n"
- "ldr q11, [x14, x28]\n"
- "fmla v21.8h, v7.8h, v9.8h\n"
- "fmla v27.8h, v6.8h, v9.8h\n"
- "ldr q12, [x26, x28]\n"
- "fmla v31.8h, v4.8h, v10.8h\n"
- "fmla v26.8h, v3.8h, v10.8h\n"
- "fmla v20.8h, v1.8h, v10.8h\n"
- "fmla v22.8h, v0.8h, v10.8h\n"
- "fmla v28.8h, v7.8h, v10.8h\n"
- "fmla v25.8h, v6.8h, v10.8h\n"
- "ldr q10, [x7, x17]\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v30.8h, v7.8h, v12.8h\n"
- "ldr q9, [x12, x4]\n"
- "fmla v17.8h, v8.8h, v11.8h\n"
- "fmla v29.8h, v7.8h, v11.8h\n"
- "fmla v16.8h, v5.8h, v11.8h\n"
- "fmla v24.8h, v4.8h, v11.8h\n"
- "fmla v23.8h, v2.8h, v11.8h\n"
- "fmla v19.8h, v1.8h, v11.8h\n"
- "ldr q12, [x7, x11]\n"
- "add x7, x7, #0x10\n"
+ "mov v31.16b, v13.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v8.8h, v9.8h\n"
+ "add x3, x3, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "cmp x3, x6, LSL #4\n"
+ "add x8, x8, #0x10\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v7.8h, v9.8h\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v5.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ld1 { v26.8h }, [x25]\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x25, x12]\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v22.8h, v8.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v6.8h, v26.8h\n"
+ "ldr q11, [x9, x14]\n"
"fmla v31.8h, v7.8h, v9.8h\n"
- "fmla v26.8h, v6.8h, v9.8h\n"
- "fmla v20.8h, v4.8h, v9.8h\n"
- "fmla v22.8h, v3.8h, v9.8h\n"
- "fmla v21.8h, v1.8h, v9.8h\n"
- "fmla v27.8h, v0.8h, v9.8h\n"
- "ldr q9, [x12, x28]\n"
- "fmla v28.8h, v2.8h, v10.8h\n"
- "fmla v25.8h, v1.8h, v10.8h\n"
- "fmla v17.8h, v0.8h, v10.8h\n"
- "ld1 { v10.8h }, [x14]\n"
- "fmla v18.8h, v2.8h, v9.8h\n"
- "fmla v29.8h, v0.8h, v12.8h\n"
- "fmla v31.8h, v3.8h, v10.8h\n"
- "fmla v20.8h, v0.8h, v10.8h\n"
- "fmla v16.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v7.8h, v9.8h\n"
- "fmla v23.8h, v5.8h, v9.8h\n"
- "fmla v19.8h, v4.8h, v9.8h\n"
- "fmla v30.8h, v1.8h, v9.8h\n"
- "ldr q11, [x9, x17]\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
- "fmla v17.8h, v1.8h, v12.8h\n"
- "ldr q12, [x14, x25]\n"
- "add x14, x14, #0x10\n"
- "ldr q9, [x14, x17]\n"
- "fmla v28.8h, v6.8h, v10.8h\n"
- "ld1 { v10.8h }, [x12]\n"
- "fmla v27.8h, v4.8h, v11.8h\n"
- "fmla v18.8h, v3.8h, v11.8h\n"
- "fmla v29.8h, v8.8h, v12.8h\n"
- "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "fmla v18.8h, v6.8h, v12.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v3.8h, v12.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q10, [x7, x4]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v17.8h\n"
+ "ldr q12, [x7, x13]\n"
+ "fmla v23.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ldr q13, [x16, #0x0]\n"
+ "fmla v29.8h, v8.8h, v9.8h\n"
+ "fmla v30.8h, v5.8h, v9.8h\n"
+ "fmla v20.8h, v2.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x11]\n"
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x12]\n"
"fmla v19.8h, v2.8h, v12.8h\n"
- "ldr q12, [x12, x25]\n"
- "add x12, x12, #0x10\n"
- "fmla v31.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v3.8h, v10.8h\n"
- "fmla v21.8h, v0.8h, v10.8h\n"
- "ldr q10, [x26, x17]\n"
- "fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v27.8h, v7.8h, v10.8h\n"
- "fmla v18.8h, v6.8h, v10.8h\n"
- "fmla v20.8h, v8.8h, v11.8h\n"
- "fmla v22.8h, v7.8h, v11.8h\n"
- "fmla v23.8h, v6.8h, v11.8h\n"
- "fmla v21.8h, v5.8h, v11.8h\n"
- "ldr q11, [x9, x11]\n"
- "fmla v19.8h, v5.8h, v12.8h\n"
- "fmla v27.8h, v5.8h, v11.8h\n"
- "fmla v18.8h, v4.8h, v11.8h\n"
- "fmla v30.8h, v3.8h, v11.8h\n"
- "fmla v24.8h, v8.8h, v12.8h\n"
- "ldr q12, [x26, x11]\n"
- "fmla v21.8h, v8.8h, v10.8h\n"
- "ldr q10, [x15, x4]\n"
- "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x27]\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x26, x26, #0x10\n"
- "fmla v19.8h, v6.8h, v11.8h\n"
- "ldr q11, [x15, x28]\n"
- "fmla v27.8h, v8.8h, v12.8h\n"
- "add x15, x15, #0x10\n"
- "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v17.8h, v1.8h, v11.8h\n"
+ "fmla v16.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x11, x15]\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
"fmla v30.8h, v6.8h, v12.8h\n"
- "ldr q12, [x9, x4]\n"
- "fmla v28.8h, v4.8h, v10.8h\n"
- "fmla v25.8h, v3.8h, v10.8h\n"
- "fmax v28.8h, v28.8h, v13.8h\n"
- "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x27, x12]\n"
+ "fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x11, x14]\n"
+ "fmla v22.8h, v4.8h, v11.8h\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "fmla v23.8h, v0.8h, v11.8h\n"
+ "fmla v27.8h, v8.8h, v12.8h\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "ldr q9, [x25, x4]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "ldr q12, [x10, x4]\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v19.8h, v4.8h, v10.8h\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
"fmla v26.8h, v0.8h, v10.8h\n"
- "ldr q10, [x9, x28]\n"
- "ldr q0, [x16, #0x10]\n"
- "fmla v17.8h, v5.8h, v11.8h\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmax v25.8h, v25.8h, v13.8h\n"
+ "ldr q11, [x10, x13]\n"
+ "fmla v20.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x25, x13]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q10, [x7, x15]\n"
+ "fmla v17.8h, v8.8h, v9.8h\n"
+ "fmla v16.8h, v7.8h, v9.8h\n"
+ "ldr q9, [x9, x4]\n"
+ "fmla v19.8h, v8.8h, v11.8h\n"
+ "fmla v18.8h, v7.8h, v11.8h\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v11.8h\n"
+ "fmla v27.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x7, x14]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.8h, v7.8h, v9.8h\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v22.8h, v1.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x10]\n"
+ "fmla v18.8h, v0.8h, v12.8h\n"
+ "fmla v17.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v8.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v9.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v9.8h\n"
+ "fmla v16.8h, v1.8h, v9.8h\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x10, x12]\n"
+ "add x10, x10, #0x10\n"
+ "ldr q9, [x10, x15]\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x9]\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x9, x12]\n"
+ "fmla v29.8h, v6.8h, v10.8h\n"
"add x9, x9, #0x10\n"
- "fmla v16.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x15]\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v8.8h, v12.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v25.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x11, x13]\n"
+ "fmla v20.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x11, x4]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v25.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x27, x4]\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v22.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
"ldr q2, [x16, #0x30]\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "ldr q11, [x7, x25]\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x7, x12]\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v20.8h, v7.8h, v12.8h\n"
- "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
"ldr q6, [x16, #0x70]\n"
- "fmla v21.8h, v4.8h, v12.8h\n"
- "fmla v27.8h, v3.8h, v12.8h\n"
- "ldr q12, [x14, x11]\n"
+ "fmla v20.8h, v4.8h, v12.8h\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v23.8h, v8.8h, v10.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
"ldr q8, [x16, #0x90]\n"
- "fmla v19.8h, v7.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
"ldr q7, [x16, #0x80]\n"
- "fmla v18.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
"ldr q5, [x16, #0x60]\n"
- "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
"ld1 { v10.8h }, [x7]\n"
"ldr q4, [x16, #0x50]\n"
- "fmax v17.8h, v17.8h, v13.8h\n"
- "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v14.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmax v19.8h, v19.8h, v14.8h\n"
+ "fmax v18.8h, v18.8h, v14.8h\n"
"add x16, x16, #0xa0\n"
- "fmax v31.8h, v31.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v13.8h\n"
- "fmax v16.8h, v16.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v13.8h\n"
- "fmax v20.8h, v20.8h, v13.8h\n"
- "fmax v22.8h, v22.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v13.8h\n"
- "fmax v19.8h, v19.8h, v13.8h\n"
- "fmax v21.8h, v21.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v13.8h\n"
- "fmax v18.8h, v18.8h, v13.8h\n"
- "fmax v30.8h, v30.8h, v13.8h\n"
- "fmin v28.8h, v28.8h, v15.8h\n"
- "fmin v25.8h, v25.8h, v15.8h\n"
- "st1 { v28.8h }, [x8]\n"
- "fmin v17.8h, v17.8h, v15.8h\n"
- "fmin v29.8h, v29.8h, v15.8h\n"
- "str q25, [x8, x5]\n"
- "fmin v31.8h, v31.8h, v15.8h\n"
- "fmin v26.8h, v26.8h, v15.8h\n"
- "str q17, [x8, x23]\n"
- "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v14.8h\n"
+ "fmax v23.8h, v23.8h, v14.8h\n"
+ "fmax v26.8h, v26.8h, v14.8h\n"
+ "fmax v30.8h, v30.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v14.8h\n"
+ "fmax v21.8h, v21.8h, v14.8h\n"
+ "fmax v27.8h, v27.8h, v14.8h\n"
+ "fmax v20.8h, v20.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v14.8h\n"
+ "fmax v17.8h, v17.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v14.8h\n"
"fmin v24.8h, v24.8h, v15.8h\n"
- "str q29, [x8, x22]\n"
- "add x8, x8, #0x10\n"
- "fmin v20.8h, v20.8h, v15.8h\n"
"fmin v22.8h, v22.8h, v15.8h\n"
- "st1 { v31.8h }, [x10]\n"
- "fmin v23.8h, v23.8h, v15.8h\n"
"fmin v19.8h, v19.8h, v15.8h\n"
- "str q26, [x10, x5]\n"
- "fmin v21.8h, v21.8h, v15.8h\n"
- "fmin v27.8h, v27.8h, v15.8h\n"
- "str q16, [x10, x23]\n"
"fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "st1 { v24.8h }, [x17]\n"
"fmin v30.8h, v30.8h, v15.8h\n"
- "str q24, [x10, x22]\n"
- "add x10, x10, #0x10\n"
- "st1 { v20.8h }, [x27]\n"
- "str q22, [x27, x5]\n"
- "str q23, [x27, x23]\n"
- "str q19, [x27, x22]\n"
- "add x27, x27, #0x10\n"
- "st1 { v21.8h }, [x24]\n"
- "str q27, [x24, x5]\n"
- "str q18, [x24, x23]\n"
- "str q30, [x24, x22]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q22, [x17, x5]\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "str q19, [x17, x21]\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "str q18, [x17, x20]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "st1 { v29.8h }, [x28]\n"
+ "str q31, [x28, x5]\n"
+ "str q23, [x28, x21]\n"
+ "str q26, [x28, x20]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v30.8h }, [x26]\n"
+ "str q28, [x26, x5]\n"
+ "str q21, [x26, x21]\n"
+ "str q27, [x26, x20]\n"
+ "add x26, x26, #0x10\n"
+ "st1 { v20.8h }, [x24]\n"
+ "str q25, [x24, x5]\n"
+ "str q17, [x24, x21]\n"
+ "str q16, [x24, x20]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v16.16b, v14.16b\n fmla v16.8h, v4.8h, v9.8h\n"
- "mov v23.16b, v14.16b\n fmla v23.8h, v8.8h, v9.8h\n"
- "mov v31.16b, v14.16b\n fmla v31.8h, v3.8h, v9.8h\n"
- "mov v30.16b, v14.16b\n fmla v30.8h, v1.8h, v9.8h\n"
- "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
- "fmla v16.8h, v5.8h, v12.8h\n"
- "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "mov v19.16b, v14.16b\n fmla v19.8h, v6.8h, v9.8h\n"
- "mov v28.16b, v14.16b\n fmla v28.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v14.16b\n fmla v27.8h, v2.8h, v9.8h\n"
- "ldr q24, [x12, x17]\n"
- "fmla v23.8h, v0.8h, v10.8h\n"
- "ld1 { v21.8h }, [x26]\n"
- "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
- "ldr q20, [x26, x25]\n"
- "fmla v31.8h, v4.8h, v12.8h\n"
+ "mov v31.16b, v13.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v6.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v5.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x9, x15]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ld1 { v22.8h }, [x25]\n"
+ "mov v10.16b, v13.16b\n fmla v10.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x25, x12]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
- "fmla v18.8h, v1.8h, v12.8h\n"
- "mov v26.16b, v14.16b\n fmla v26.8h, v6.8h, v21.8h\n"
- "ldr q9, [x12, x11]\n"
- "fmla v16.8h, v7.8h, v24.8h\n"
- "fmla v17.8h, v8.8h, v12.8h\n"
- "fmla v19.8h, v7.8h, v12.8h\n"
- "fmla v29.8h, v6.8h, v12.8h\n"
- "mov v11.16b, v14.16b\n fmla v11.8h, v3.8h, v12.8h\n"
- "mov v10.16b, v14.16b\n fmla v10.8h, v0.8h, v12.8h\n"
- "ldr q22, [x7, x4]\n"
- "mov v25.16b, v14.16b\n fmla v25.8h, v8.8h, v20.8h\n"
- "ldr q21, [x7, x28]\n"
- "fmla v31.8h, v6.8h, v24.8h\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v8.8h, v12.8h\n"
+ "mov v9.16b, v13.16b\n fmla v9.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x9, x14]\n"
+ "fmla v31.8h, v7.8h, v24.8h\n"
+ "fmla v21.8h, v7.8h, v12.8h\n"
+ "fmla v10.8h, v6.8h, v12.8h\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v3.8h, v12.8h\n"
+ "mov v11.16b, v13.16b\n fmla v11.8h, v0.8h, v12.8h\n"
+ "ldr q23, [x7, x4]\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x7, x13]\n"
+ "fmla v29.8h, v6.8h, v24.8h\n"
"fmla v30.8h, v4.8h, v24.8h\n"
- "fmla v18.8h, v3.8h, v24.8h\n"
- "mov v12.16b, v14.16b\n fmla v12.8h, v1.8h, v24.8h\n"
- "fmla v14.8h, v0.8h, v24.8h\n"
- "fmla v28.8h, v8.8h, v24.8h\n"
+ "fmla v19.8h, v3.8h, v24.8h\n"
+ "mov v12.16b, v13.16b\n fmla v12.8h, v1.8h, v24.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v0.8h, v24.8h\n"
+ "fmla v18.8h, v8.8h, v24.8h\n"
"fmla v27.8h, v5.8h, v24.8h\n"
- "fmla v26.8h, v2.8h, v24.8h\n"
- "ld1 { v24.8h }, [x15]\n"
- "fmla v16.8h, v8.8h, v9.8h\n"
- "fmla v23.8h, v1.8h, v22.8h\n"
- "fmla v17.8h, v0.8h, v22.8h\n"
- "ldr q22, [x15, x25]\n"
- "fmla v19.8h, v2.8h, v21.8h\n"
- "fmla v29.8h, v1.8h, v21.8h\n"
- "ld1 { v20.8h }, [x9]\n"
- "fmla v31.8h, v7.8h, v9.8h\n"
- "fmla v11.8h, v6.8h, v9.8h\n"
- "fmla v30.8h, v5.8h, v9.8h\n"
- "fmla v18.8h, v4.8h, v9.8h\n"
- "fmla v10.8h, v3.8h, v9.8h\n"
- "fmla v12.8h, v2.8h, v9.8h\n"
- "fmla v14.8h, v1.8h, v9.8h\n"
- "fmla v25.8h, v0.8h, v9.8h\n"
- "ldr q21, [x15, x17]\n"
- "fmla v28.8h, v0.8h, v24.8h\n"
- "fmla v27.8h, v6.8h, v20.8h\n"
- "fmla v26.8h, v3.8h, v20.8h\n"
- "ldr q20, [x9, x25]\n"
- "fmla v16.8h, v1.8h, v21.8h\n"
- "fmla v23.8h, v3.8h, v24.8h\n"
- "fmla v29.8h, v5.8h, v22.8h\n"
- "fmla v11.8h, v2.8h, v22.8h\n"
- "ldr q22, [x15, x11]\n"
- "fmla v17.8h, v4.8h, v21.8h\n"
- "fmla v19.8h, v3.8h, v21.8h\n"
- "fmla v31.8h, v0.8h, v21.8h\n"
- "fmla v10.8h, v8.8h, v20.8h\n"
- "fmla v25.8h, v5.8h, v20.8h\n"
- "ldr q20, [x26, x4]\n"
- "fmla v28.8h, v2.8h, v21.8h\n"
- "fmla v16.8h, v2.8h, v22.8h\n"
- "fmla v23.8h, v5.8h, v21.8h\n"
- "ldr q21, [x14, x4]\n"
- "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v9.8h, v2.8h, v24.8h\n"
+ "ld1 { v24.8h }, [x11]\n"
+ "fmla v31.8h, v8.8h, v22.8h\n"
+ "fmla v17.8h, v1.8h, v23.8h\n"
+ "fmla v20.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x11, x12]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v10.8h, v1.8h, v16.8h\n"
+ "ld1 { v16.8h }, [x27]\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "fmla v28.8h, v6.8h, v22.8h\n"
+ "fmla v30.8h, v5.8h, v22.8h\n"
"fmla v19.8h, v4.8h, v22.8h\n"
- "fmla v29.8h, v3.8h, v22.8h\n"
+ "fmla v11.8h, v3.8h, v22.8h\n"
+ "fmla v12.8h, v2.8h, v22.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v26.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x11, x15]\n"
+ "fmla v18.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
+ "fmla v9.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x27, x12]\n"
+ "fmla v17.8h, v3.8h, v24.8h\n"
"fmla v31.8h, v1.8h, v22.8h\n"
- "fmla v11.8h, v0.8h, v22.8h\n"
- "ldr q22, [x14, x28]\n"
- "fmla v26.8h, v7.8h, v20.8h\n"
- "fmla v12.8h, v6.8h, v20.8h\n"
- "ldr q20, [x26, x28]\n"
- "fmla v28.8h, v4.8h, v21.8h\n"
- "fmla v16.8h, v3.8h, v21.8h\n"
- "fmla v27.8h, v1.8h, v21.8h\n"
- "fmla v30.8h, v0.8h, v21.8h\n"
- "fmla v23.8h, v7.8h, v21.8h\n"
- "fmla v17.8h, v6.8h, v21.8h\n"
- "ldr q21, [x7, x17]\n"
- "fmla v14.8h, v8.8h, v20.8h\n"
- "fmla v25.8h, v7.8h, v20.8h\n"
- "ldr q20, [x12, x4]\n"
- "fmla v19.8h, v8.8h, v22.8h\n"
- "fmla v29.8h, v7.8h, v22.8h\n"
- "fmla v31.8h, v5.8h, v22.8h\n"
- "fmla v11.8h, v4.8h, v22.8h\n"
+ "fmla v10.8h, v5.8h, v23.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x11, x14]\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v21.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v11.8h, v8.8h, v16.8h\n"
+ "fmla v26.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x25, x4]\n"
"fmla v18.8h, v2.8h, v22.8h\n"
- "fmla v10.8h, v1.8h, v22.8h\n"
- "ldr q22, [x7, x11]\n"
+ "fmla v31.8h, v2.8h, v23.8h\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "ldr q22, [x10, x4]\n"
+ "fmla v20.8h, v5.8h, v23.8h\n"
+ "fmla v21.8h, v4.8h, v23.8h\n"
+ "fmla v10.8h, v3.8h, v23.8h\n"
+ "fmla v29.8h, v1.8h, v23.8h\n"
+ "fmla v28.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x10, x13]\n"
+ "fmla v9.8h, v7.8h, v16.8h\n"
+ "fmla v12.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x25, x13]\n"
+ "fmla v18.8h, v4.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v30.8h, v0.8h, v22.8h\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v20.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x7, x15]\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "fmla v26.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x9, x4]\n"
+ "fmla v21.8h, v8.8h, v23.8h\n"
+ "fmla v10.8h, v7.8h, v23.8h\n"
+ "fmla v29.8h, v5.8h, v23.8h\n"
+ "fmla v28.8h, v4.8h, v23.8h\n"
+ "fmla v19.8h, v2.8h, v23.8h\n"
+ "fmla v11.8h, v1.8h, v23.8h\n"
+ "ldr q23, [x7, x14]\n"
"add x7, x7, #0x10\n"
- "fmla v28.8h, v7.8h, v20.8h\n"
- "fmla v16.8h, v6.8h, v20.8h\n"
- "fmla v27.8h, v4.8h, v20.8h\n"
- "fmla v30.8h, v3.8h, v20.8h\n"
- "fmla v26.8h, v1.8h, v20.8h\n"
- "fmla v12.8h, v0.8h, v20.8h\n"
- "ldr q20, [x12, x28]\n"
- "fmla v23.8h, v2.8h, v21.8h\n"
- "fmla v17.8h, v1.8h, v21.8h\n"
- "fmla v19.8h, v0.8h, v21.8h\n"
- "ld1 { v21.8h }, [x14]\n"
- "fmla v14.8h, v2.8h, v20.8h\n"
- "fmla v29.8h, v0.8h, v22.8h\n"
- "fmla v28.8h, v3.8h, v21.8h\n"
- "fmla v27.8h, v0.8h, v21.8h\n"
- "fmla v31.8h, v8.8h, v20.8h\n"
- "fmla v11.8h, v7.8h, v20.8h\n"
- "fmla v18.8h, v5.8h, v20.8h\n"
- "fmla v10.8h, v4.8h, v20.8h\n"
- "fmla v25.8h, v1.8h, v20.8h\n"
- "ldr q24, [x9, x17]\n"
+ "fmla v18.8h, v7.8h, v16.8h\n"
+ "fmla v31.8h, v6.8h, v16.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v30.8h, v3.8h, v16.8h\n"
+ "fmla v9.8h, v1.8h, v16.8h\n"
+ "fmla v12.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x13]\n"
"fmla v17.8h, v2.8h, v22.8h\n"
- "fmla v19.8h, v1.8h, v22.8h\n"
- "ldr q20, [x14, x25]\n"
- "add x14, x14, #0x10\n"
- "fmla v23.8h, v6.8h, v21.8h\n"
- "ld1 { v21.8h }, [x12]\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v0.8h, v22.8h\n"
+ "ld1 { v22.8h }, [x10]\n"
+ "fmla v10.8h, v0.8h, v23.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v8.8h, v16.8h\n"
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
+ "fmla v11.8h, v4.8h, v16.8h\n"
+ "fmla v26.8h, v1.8h, v16.8h\n"
+ "ldr q24, [x27, x15]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "ldr q16, [x10, x12]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "ld1 { v22.8h }, [x9]\n"
"fmla v12.8h, v4.8h, v24.8h\n"
- "fmla v14.8h, v3.8h, v24.8h\n"
- "fmla v29.8h, v8.8h, v20.8h\n"
- "fmla v11.8h, v5.8h, v20.8h\n"
- "fmla v10.8h, v2.8h, v20.8h\n"
- "ldr q20, [x12, x25]\n"
- "add x12, x12, #0x10\n"
- "fmla v28.8h, v6.8h, v21.8h\n"
- "fmla v27.8h, v3.8h, v21.8h\n"
- "fmla v26.8h, v0.8h, v21.8h\n"
- "ldr q22, [x26, x17]\n"
- "fmla v25.8h, v2.8h, v20.8h\n"
- "fmla v12.8h, v7.8h, v22.8h\n"
- "fmla v14.8h, v6.8h, v22.8h\n"
- "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v25.8h, v3.8h, v24.8h\n"
"fmla v30.8h, v7.8h, v24.8h\n"
- "fmla v18.8h, v6.8h, v24.8h\n"
- "fmla v26.8h, v5.8h, v24.8h\n"
- "ldr q21, [x9, x11]\n"
- "fmla v10.8h, v5.8h, v20.8h\n"
- "fmla v12.8h, v5.8h, v21.8h\n"
- "fmla v14.8h, v4.8h, v21.8h\n"
- "fmla v25.8h, v3.8h, v21.8h\n"
- "fmla v11.8h, v8.8h, v20.8h\n"
- "ldr q20, [x26, x11]\n"
- "fmla v26.8h, v8.8h, v22.8h\n"
- "ldr q9, [x15, x4]\n"
- "fmla v30.8h, v8.8h, v21.8h\n"
- "fmla v18.8h, v7.8h, v21.8h\n"
- "add x26, x26, #0x10\n"
- "fmla v10.8h, v6.8h, v21.8h\n"
- "ldr q21, [x15, x28]\n"
- "fmla v12.8h, v8.8h, v20.8h\n"
- "add x15, x15, #0x10\n"
- "fmla v14.8h, v7.8h, v20.8h\n"
- "fmla v25.8h, v6.8h, v20.8h\n"
- "ldr q24, [x9, x4]\n"
- "fmla v23.8h, v4.8h, v9.8h\n"
- "fmla v17.8h, v3.8h, v9.8h\n"
- "fmax v23.8h, v23.8h, v13.8h\n"
- "fmla v28.8h, v1.8h, v9.8h\n"
- "fmla v16.8h, v0.8h, v9.8h\n"
- "ldr q0, [x9, x28]\n"
- "fmax v17.8h, v17.8h, v13.8h\n"
- "fmla v19.8h, v5.8h, v21.8h\n"
- "fmla v29.8h, v4.8h, v21.8h\n"
- "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmla v10.8h, v8.8h, v16.8h\n"
+ "fmla v28.8h, v5.8h, v16.8h\n"
+ "fmla v11.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x9, x12]\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
"add x9, x9, #0x10\n"
- "fmla v31.8h, v2.8h, v21.8h\n"
- "fmla v11.8h, v1.8h, v21.8h\n"
- "fmax v29.8h, v29.8h, v13.8h\n"
- "fmla v27.8h, v7.8h, v24.8h\n"
- "fmla v30.8h, v6.8h, v24.8h\n"
- "fmax v28.8h, v28.8h, v13.8h\n"
- "fmla v26.8h, v4.8h, v24.8h\n"
- "fmla v12.8h, v3.8h, v24.8h\n"
- "fmax v16.8h, v16.8h, v13.8h\n"
- "fmla v18.8h, v8.8h, v0.8h\n"
- "fmla v10.8h, v7.8h, v0.8h\n"
- "fmax v31.8h, v31.8h, v13.8h\n"
- "fmla v14.8h, v5.8h, v0.8h\n"
- "fmla v25.8h, v4.8h, v0.8h\n"
- "fmax v11.8h, v11.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v13.8h\n"
- "fmax v30.8h, v30.8h, v13.8h\n"
- "fmax v18.8h, v18.8h, v13.8h\n"
- "fmax v10.8h, v10.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v13.8h\n"
- "fmax v12.8h, v12.8h, v13.8h\n"
- "fmax v14.8h, v14.8h, v13.8h\n"
- "fmax v25.8h, v25.8h, v13.8h\n"
- "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v9.8h, v0.8h, v22.8h\n"
+ "ldr q23, [x25, x15]\n"
+ "fmla v19.8h, v6.8h, v24.8h\n"
+ "fmla v26.8h, v2.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v6.8h, v23.8h\n"
+ "fmla v11.8h, v5.8h, v16.8h\n"
+ "ldr q22, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v9.8h, v5.8h, v24.8h\n"
+ "ldr q16, [x27, x14]\n"
+ "fmla v12.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v16.8h\n"
+ "fmla v26.8h, v3.8h, v16.8h\n"
+ "fmla v30.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmla v11.8h, v6.8h, v16.8h\n"
+ "ldr q24, [x11, x13]\n"
+ "fmla v9.8h, v8.8h, v23.8h\n"
+ "ldr q16, [x11, x4]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v12.8h, v8.8h, v22.8h\n"
+ "fmla v25.8h, v7.8h, v22.8h\n"
+ "fmla v26.8h, v6.8h, v22.8h\n"
+ "ldr q23, [x27, x4]\n"
+ "fmla v21.8h, v5.8h, v24.8h\n"
+ "fmla v10.8h, v4.8h, v24.8h\n"
+ "fmla v17.8h, v4.8h, v16.8h\n"
+ "fmla v20.8h, v3.8h, v16.8h\n"
+ "fmla v18.8h, v1.8h, v16.8h\n"
+ "fmla v31.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x27, x13]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v29.8h, v2.8h, v24.8h\n"
+ "fmla v28.8h, v1.8h, v24.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v30.8h, v6.8h, v23.8h\n"
+ "fmax v21.8h, v21.8h, v14.8h\n"
+ "fmla v9.8h, v4.8h, v23.8h\n"
+ "fmla v12.8h, v3.8h, v23.8h\n"
+ "fmax v17.8h, v17.8h, v14.8h\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "fmla v11.8h, v7.8h, v16.8h\n"
+ "fmax v20.8h, v20.8h, v14.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "fmla v26.8h, v4.8h, v16.8h\n"
+ "fmax v10.8h, v10.8h, v14.8h\n"
+ "fmax v18.8h, v18.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v14.8h\n"
+ "fmax v27.8h, v27.8h, v14.8h\n"
+ "fmax v30.8h, v30.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v14.8h\n"
+ "fmax v11.8h, v11.8h, v14.8h\n"
+ "fmax v9.8h, v9.8h, v14.8h\n"
+ "fmax v12.8h, v12.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v14.8h\n"
+ "fmax v26.8h, v26.8h, v14.8h\n"
"fmin v17.8h, v17.8h, v15.8h\n"
- "st1 { v23.8h }, [x8]\n"
- "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v10.8h, v10.8h, v15.8h\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"fmin v29.8h, v29.8h, v15.8h\n"
- "str q17, [x8, x5]\n"
"fmin v28.8h, v28.8h, v15.8h\n"
- "fmin v16.8h, v16.8h, v15.8h\n"
- "str q19, [x8, x23]\n"
- "fmin v31.8h, v31.8h, v15.8h\n"
- "fmin v11.8h, v11.8h, v15.8h\n"
- "str q29, [x8, x22]\n"
- "add x8, x8, #0x10\n"
+ "st1 { v17.8h }, [x17]\n"
"fmin v27.8h, v27.8h, v15.8h\n"
"fmin v30.8h, v30.8h, v15.8h\n"
- "st1 { v28.8h }, [x10]\n"
- "fmin v18.8h, v18.8h, v15.8h\n"
- "fmin v10.8h, v10.8h, v15.8h\n"
- "str q16, [x10, x5]\n"
- "fmin v26.8h, v26.8h, v15.8h\n"
+ "str q20, [x17, x5]\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v15.8h\n"
+ "str q21, [x17, x21]\n"
+ "fmin v9.8h, v9.8h, v15.8h\n"
"fmin v12.8h, v12.8h, v15.8h\n"
- "str q31, [x10, x23]\n"
- "fmin v14.8h, v14.8h, v15.8h\n"
+ "str q10, [x17, x20]\n"
+ "add x17, x17, #0x10\n"
"fmin v25.8h, v25.8h, v15.8h\n"
- "str q11, [x10, x22]\n"
- "add x10, x10, #0x10\n"
- "st1 { v27.8h }, [x27]\n"
- "str q30, [x27, x5]\n"
- "str q18, [x27, x23]\n"
- "str q10, [x27, x22]\n"
- "add x27, x27, #0x10\n"
- "st1 { v26.8h }, [x24]\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "st1 { v18.8h }, [x28]\n"
+ "str q31, [x28, x5]\n"
+ "str q29, [x28, x21]\n"
+ "str q28, [x28, x20]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v27.8h }, [x26]\n"
+ "str q30, [x26, x5]\n"
+ "str q19, [x26, x21]\n"
+ "str q11, [x26, x20]\n"
+ "add x26, x26, #0x10\n"
+ "st1 { v9.8h }, [x24]\n"
"str q12, [x24, x5]\n"
- "str q14, [x24, x23]\n"
- "str q25, [x24, x22]\n"
+ "str q25, [x24, x21]\n"
+ "str q26, [x24, x20]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 141f\n"
- "ldr q14, [x16, #0x0]\n"
+ "ldr q13, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "add x23, x14, x17\n"
+ "add x23, x10, x15\n"
"add x22, x7, XZR\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x21, x7, x25\n"
- "add x20, x14, x11\n"
+ "add x21, x7, x12\n"
+ "add x20, x10, x14\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"ldr q5, [x16, #0x60]\n"
@@ -699,27 +699,27 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr h11, [x21, #0x0]\n"
"ldr h12, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
- "mov v16.16b, v14.16b\n fmla v16.8h, v8.8h, v9.8h\n"
- "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
- "add x20, x26, XZR\n"
- "mov v18.16b, v14.16b\n fmla v18.8h, v6.8h, v9.8h\n"
- "mov v21.16b, v14.16b\n fmla v21.8h, v4.8h, v9.8h\n"
- "mov v22.16b, v14.16b\n fmla v22.8h, v3.8h, v9.8h\n"
- "mov v25.16b, v14.16b\n fmla v25.8h, v1.8h, v9.8h\n"
- "mov v26.16b, v14.16b\n fmla v26.8h, v0.8h, v9.8h\n"
- "mov v19.16b, v14.16b\n fmla v19.8h, v2.8h, v11.8h\n"
- "mov v20.16b, v14.16b\n fmla v20.8h, v5.8h, v9.8h\n"
- "mov v24.16b, v14.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "add x20, x25, XZR\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v0.8h, v10.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
"fmla v22.8h, v4.8h, v12.8h\n"
- "mov v23.16b, v14.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
"fmla v26.8h, v1.8h, v12.8h\n"
- "mov v27.16b, v14.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 10f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -740,8 +740,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
"ldr h10, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
- "mov v28.16b, v14.16b\n fmla v28.8h, v6.8h, v10.8h\n"
- "add x20, x26, x25\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x25, x12\n"
"tbz %x[n_channels], #2, 14f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
@@ -762,8 +762,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
"ldr h11, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
- "mov v31.16b, v14.16b\n fmla v31.8h, v8.8h, v11.8h\n"
- "add x20, x12, x17\n"
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x9, x15\n"
"tbz %x[n_channels], #2, 18f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
@@ -792,8 +792,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
- "mov v29.16b, v14.16b\n fmla v29.8h, v1.8h, v9.8h\n"
- "mov v30.16b, v14.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 22f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
@@ -816,7 +816,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
"fmla v16.8h, v1.8h, v12.8h\n"
"fmla v17.8h, v0.8h, v12.8h\n"
- "add x20, x7, x28\n"
+ "add x20, x7, x13\n"
"tbz %x[n_channels], #2, 26f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
@@ -839,7 +839,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: End
"fmla v18.8h, v2.8h, v11.8h\n"
"fmla v19.8h, v1.8h, v11.8h\n"
- "add x20, x12, x11\n"
+ "add x20, x9, x14\n"
"tbz %x[n_channels], #2, 30f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
@@ -862,7 +862,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"32:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v22.8h, v7.8h, v10.8h\n"
- "add x20, x15, XZR\n"
+ "add x20, x11, XZR\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
@@ -892,7 +892,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"36:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
"fmla v16.8h, v3.8h, v9.8h\n"
"fmla v20.8h, v0.8h, v9.8h\n"
- "add x20, x15, x25\n"
+ "add x20, x11, x12\n"
"tbz %x[n_channels], #2, 38f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
@@ -915,7 +915,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"40:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: End
"fmla v19.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v2.8h, v12.8h\n"
- "add x20, x9, XZR\n"
+ "add x20, x27, XZR\n"
"tbz %x[n_channels], #2, 42f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
@@ -938,7 +938,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"44:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
"fmla v24.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x20, x15, x17\n"
+ "add x20, x11, x15\n"
"tbz %x[n_channels], #2, 46f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
@@ -961,7 +961,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"48:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "add x20, x9, x25\n"
+ "add x20, x27, x12\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
@@ -988,7 +988,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"52:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
"fmla v27.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v5.8h, v11.8h\n"
- "add x20, x15, x11\n"
+ "add x20, x11, x14\n"
"tbz %x[n_channels], #2, 54f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
@@ -1011,7 +1011,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"56:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
- "add x20, x26, x4\n"
+ "add x20, x25, x4\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
@@ -1038,7 +1038,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"60:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
"fmla v28.8h, v7.8h, v11.8h\n"
"fmla v29.8h, v6.8h, v11.8h\n"
- "add x20, x14, x4\n"
+ "add x20, x10, x4\n"
"tbz %x[n_channels], #2, 62f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
@@ -1061,7 +1061,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"64:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "add x20, x26, x28\n"
+ "add x20, x25, x13\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
@@ -1088,7 +1088,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"68:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
"fmla v30.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v7.8h, v11.8h\n"
- "add x20, x14, x28\n"
+ "add x20, x10, x13\n"
"tbz %x[n_channels], #2, 70f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
@@ -1111,7 +1111,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"72:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "add x20, x7, x17\n"
+ "add x20, x7, x15\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
@@ -1138,7 +1138,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"76:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
- "add x20, x12, x4\n"
+ "add x20, x9, x4\n"
"fmla v18.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 78f\n"
"ldr d11, [x20], #0x8\n"
@@ -1162,7 +1162,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"80:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
- "add x20, x7, x11\n"
+ "add x20, x7, x14\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
@@ -1189,7 +1189,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"84:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "add x20, x14, XZR\n"
+ "add x20, x10, XZR\n"
"fmla v19.8h, v0.8h, v12.8h\n"
"tbz %x[n_channels], #2, 86f\n"
"ldr d10, [x20], #0x8\n"
@@ -1213,7 +1213,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"88:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
- "add x20, x12, x28\n"
+ "add x20, x9, x13\n"
"fmla v24.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 90f\n"
"ldr d11, [x20], #0x8\n"
@@ -1237,7 +1237,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"92:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
"fmla v22.8h, v8.8h, v11.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x20, x14, x25\n"
+ "add x20, x10, x12\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v11.8h\n"
@@ -1264,7 +1264,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"96:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
- "add x20, x12, XZR\n"
+ "add x20, x9, XZR\n"
"fmla v27.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 98f\n"
"ldr d10, [x20], #0x8\n"
@@ -1288,7 +1288,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"100:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "add x20, x9, x17\n"
+ "add x20, x27, x15\n"
"fmla v28.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 102f\n"
"ldr d11, [x20], #0x8\n"
@@ -1312,7 +1312,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"104:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v24.8h, v8.8h, v11.8h\n"
"fmla v25.8h, v7.8h, v11.8h\n"
- "add x20, x12, x25\n"
+ "add x20, x9, x12\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
@@ -1339,7 +1339,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"108:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
"fmla v23.8h, v8.8h, v12.8h\n"
"fmla v27.8h, v5.8h, v12.8h\n"
- "add x20, x26, x17\n"
+ "add x20, x25, x15\n"
"fmla v31.8h, v2.8h, v12.8h\n"
"tbz %x[n_channels], #2, 110f\n"
"ldr d10, [x20], #0x8\n"
@@ -1363,7 +1363,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"112:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x20, x9, x11\n"
+ "add x20, x27, x14\n"
"fmla v30.8h, v6.8h, v10.8h\n"
"tbz %x[n_channels], #2, 114f\n"
"ldr d11, [x20], #0x8\n"
@@ -1387,7 +1387,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"116:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x20, x26, x11\n"
+ "add x20, x25, x14\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
@@ -1414,7 +1414,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"120:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
"fmla v29.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
- "add x20, x15, x4\n"
+ "add x20, x11, x4\n"
"fmla v31.8h, v6.8h, v12.8h\n"
"tbz %x[n_channels], #2, 122f\n"
"ldr d10, [x20], #0x8\n"
@@ -1438,7 +1438,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"124:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
- "add x20, x15, x28\n"
+ "add x20, x11, x13\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 126f\n"
@@ -1463,7 +1463,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"128:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "add x20, x9, x4\n"
+ "add x20, x27, x4\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
"tbz %x[n_channels], #2, 130f\n"
@@ -1488,7 +1488,7 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"132:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "add x20, x9, x28\n"
+ "add x20, x27, x13\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"tbz %x[n_channels], #2, 134f\n"
@@ -1513,24 +1513,24 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v14.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v17.8h, v17.8h, v13.8h\n"
- "fmax v18.8h, v18.8h, v13.8h\n"
- "fmax v19.8h, v19.8h, v13.8h\n"
- "fmax v20.8h, v20.8h, v13.8h\n"
- "fmax v21.8h, v21.8h, v13.8h\n"
- "fmax v22.8h, v22.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v13.8h\n"
- "fmax v25.8h, v25.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v13.8h\n"
- "fmax v28.8h, v28.8h, v13.8h\n"
- "fmax v29.8h, v29.8h, v13.8h\n"
- "fmax v30.8h, v30.8h, v13.8h\n"
- "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v17.8h, v17.8h, v14.8h\n"
+ "fmax v18.8h, v18.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v14.8h\n"
+ "fmax v20.8h, v20.8h, v14.8h\n"
+ "fmax v21.8h, v21.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v14.8h\n"
+ "fmax v23.8h, v23.8h, v14.8h\n"
+ "fmax v24.8h, v24.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v14.8h\n"
+ "fmax v26.8h, v26.8h, v14.8h\n"
+ "fmax v27.8h, v27.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v14.8h\n"
+ "fmax v30.8h, v30.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v14.8h\n"
"fmin v16.8h, v16.8h, v15.8h\n"
"fmin v17.8h, v17.8h, v15.8h\n"
"fmin v18.8h, v18.8h, v15.8h\n"
@@ -1548,18 +1548,18 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmin v30.8h, v30.8h, v15.8h\n"
"fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 138f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.d }[0], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v16.d }[0], [x23], x5\n"
"st1 { v20.d }[0], [x22], x5\n"
+ "add x26, x26, #0x8\n"
+ "add x24, x24, #0x8\n"
"st1 { v24.d }[0], [x21], x5\n"
- "add x8, x8, #0x8\n"
- "add x10, x10, #0x8\n"
"st1 { v28.d }[0], [x20], x5\n"
- "add x27, x27, #0x8\n"
- "add x24, x24, #0x8\n"
"st1 { v17.d }[0], [x23], x5\n"
"st1 { v21.d }[0], [x22], x5\n"
"st1 { v25.d }[0], [x21], x5\n"
@@ -1573,18 +1573,18 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v27.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 137f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.s }[2], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v16.s }[2], [x23], x5\n"
"st1 { v20.s }[2], [x22], x5\n"
+ "add x26, x26, #0x4\n"
+ "add x24, x24, #0x4\n"
"st1 { v24.s }[2], [x21], x5\n"
- "add x8, x8, #0x4\n"
- "add x10, x10, #0x4\n"
"st1 { v28.s }[2], [x20], x5\n"
- "add x27, x27, #0x4\n"
- "add x24, x24, #0x4\n"
"st1 { v17.s }[2], [x23], x5\n"
"st1 { v21.s }[2], [x22], x5\n"
"st1 { v25.s }[2], [x21], x5\n"
@@ -1598,15 +1598,15 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v27.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 140f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.h }[6], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "st1 { v16.h }[6], [x23], x5\n"
+ "st1 { v17.h }[6], [x23], x5\n"
"st1 { v20.h }[6], [x22], x5\n"
"st1 { v24.h }[6], [x21], x5\n"
"st1 { v28.h }[6], [x20], x5\n"
- "st1 { v17.h }[6], [x23], x5\n"
"st1 { v21.h }[6], [x22], x5\n"
"st1 { v25.h }[6], [x21], x5\n"
"st1 { v29.h }[6], [x20], x5\n"
@@ -1621,15 +1621,15 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"b 140f\n"
"137:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 140f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.h }[4], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "st1 { v16.h }[4], [x23], x5\n"
+ "st1 { v17.h }[4], [x23], x5\n"
"st1 { v20.h }[4], [x22], x5\n"
"st1 { v24.h }[4], [x21], x5\n"
"st1 { v28.h }[4], [x20], x5\n"
- "st1 { v17.h }[4], [x23], x5\n"
"st1 { v21.h }[4], [x22], x5\n"
"st1 { v25.h }[4], [x21], x5\n"
"st1 { v29.h }[4], [x20], x5\n"
@@ -1644,18 +1644,18 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"b 140f\n"
"138:" // Tile loop: Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 139f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.s }[0], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v16.s }[0], [x23], x5\n"
"st1 { v20.s }[0], [x22], x5\n"
+ "add x26, x26, #0x4\n"
+ "add x24, x24, #0x4\n"
"st1 { v24.s }[0], [x21], x5\n"
- "add x8, x8, #0x4\n"
- "add x10, x10, #0x4\n"
"st1 { v28.s }[0], [x20], x5\n"
- "add x27, x27, #0x4\n"
- "add x24, x24, #0x4\n"
"st1 { v17.s }[0], [x23], x5\n"
"st1 { v21.s }[0], [x22], x5\n"
"st1 { v25.s }[0], [x21], x5\n"
@@ -1669,15 +1669,15 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v27.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 140f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.h }[2], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "st1 { v16.h }[2], [x23], x5\n"
+ "st1 { v17.h }[2], [x23], x5\n"
"st1 { v20.h }[2], [x22], x5\n"
"st1 { v24.h }[2], [x21], x5\n"
"st1 { v28.h }[2], [x20], x5\n"
- "st1 { v17.h }[2], [x23], x5\n"
"st1 { v21.h }[2], [x22], x5\n"
"st1 { v25.h }[2], [x21], x5\n"
"st1 { v29.h }[2], [x20], x5\n"
@@ -1691,15 +1691,15 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v31.h }[2], [x20]\n"
"b 140f\n"
"139:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.h }[0], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "st1 { v16.h }[0], [x23], x5\n"
+ "st1 { v17.h }[0], [x23], x5\n"
"st1 { v20.h }[0], [x22], x5\n"
"st1 { v24.h }[0], [x21], x5\n"
"st1 { v28.h }[0], [x20], x5\n"
- "st1 { v17.h }[0], [x23], x5\n"
"st1 { v21.h }[0], [x22], x5\n"
"st1 { v25.h }[0], [x21], x5\n"
"st1 { v29.h }[0], [x20], x5\n"
@@ -1713,20 +1713,20 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v31.h }[0], [x20]\n"
"140:" // Tile loop: Oddments: Store: Bit 2: End
"141:" // Tile loop: End
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x26, x26, #0x1\n"
- "add x21, x27, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x26, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x27, x27, x21, LT\n"
- "csel x26, x26, XZR, LT\n"
- "cmp x27, x20\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "add x20, x28, #0x1\n"
+ "cmp x27, x22\n"
+ "csel x28, x28, x20, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x28, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 96feeeeece..2a5656a9b3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -102,9 +102,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"lsr x7, %x[n_channels], #0x3\n"
"ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v13.8h }, [x20]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x21]\n"
"ld1r { v14.8h }, [x20]\n"
"add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
"mov x15, #0x0\n"
@@ -122,583 +122,583 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr q7, [x17, #0x80]\n"
"ldr q8, [x17, #0x90]\n"
"add x17, x17, #0xa0\n"
- "ldp x21, x20, [x16, #0x0]\n"
- "ldr q9, [x21, x15]\n"
- "ldr q10, [x20, x15]\n"
+ "ldp x23, x22, [x16, #0x0]\n"
"ldp x21, x20, [x16, #0x10]\n"
+ "ldr q9, [x23, x15]\n"
+ "ldr q10, [x22, x15]\n"
"ldr q11, [x21, x15]\n"
"ldr q12, [x20, x15]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v23.16b, v30.16b\n fmla v23.8h, v4.8h, v9.8h\n"
- "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v8.8h, v9.8h\n"
"ldr x27, [x16, #0x20]\n"
"ldr x24, [x16, #0x30]\n"
- "mov v25.16b, v30.16b\n fmla v25.8h, v3.8h, v9.8h\n"
- "mov v28.16b, v30.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v3.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v1.8h, v9.8h\n"
"ldr x23, [x16, #0x28]\n"
"ldr x22, [x16, #0x38]\n"
- "mov v20.16b, v30.16b\n fmla v20.8h, v0.8h, v9.8h\n"
- "mov v16.16b, v30.16b\n fmla v16.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v7.8h, v9.8h\n"
"ldr x26, [x16, #0x40]\n"
"ldr x20, [x16, #0x48]\n"
- "mov v15.16b, v30.16b\n fmla v15.8h, v6.8h, v9.8h\n"
- "fmla v23.8h, v5.8h, v12.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v6.8h, v9.8h\n"
+ "mov v13.16b, v30.16b\n fmla v13.8h, v5.8h, v9.8h\n"
"ldr x25, [x16, #0x50]\n"
"ldr x21, [x16, #0x58]\n"
- "mov v27.16b, v30.16b\n fmla v27.8h, v5.8h, v9.8h\n"
- "mov v31.16b, v30.16b\n fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v2.8h, v9.8h\n"
"ldr q9, [x24, x15]\n"
"ldr x13, [x16, #0x70]\n"
- "fmla v17.8h, v0.8h, v10.8h\n"
- "ldr q22, [x27, x15]\n"
- "mov v10.16b, v30.16b\n fmla v10.8h, v2.8h, v11.8h\n"
- "ldr q18, [x23, x15]\n"
- "fmla v25.8h, v4.8h, v12.8h\n"
- "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v2.8h, v11.8h\n"
+ "ldr q29, [x23, x15]\n"
+ "fmla v27.8h, v4.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
"ldr x24, [x16, #0x60]\n"
"ldr x23, [x16, #0x68]\n"
- "fmla v20.8h, v1.8h, v12.8h\n"
- "fmla v16.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
"ldr x12, [x8, #0x0]\n"
"ldr x11, [x8, #0x8]\n"
- "fmla v15.8h, v7.8h, v12.8h\n"
- "mov v29.16b, v30.16b\n fmla v29.8h, v6.8h, v22.8h\n"
- "ldr q22, [x20, x15]\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v6.8h, v17.8h\n"
+ "ldr q10, [x20, x15]\n"
"ldr x28, [x16, #0x88]\n"
- "fmla v23.8h, v7.8h, v9.8h\n"
- "fmla v10.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
"ldr x10, [x8, #0x10]\n"
"ldr x9, [x8, #0x18]\n"
- "mov v21.16b, v30.16b\n fmla v21.8h, v3.8h, v12.8h\n"
+ "mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v12.8h\n"
"mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v12.8h\n"
- "ldr q11, [x22, x15]\n"
+ "ldr q12, [x22, x15]\n"
"ldr x22, [x16, #0x78]\n"
- "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v18.8h\n"
- "ldr q12, [x26, x15]\n"
- "fmla v25.8h, v6.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v29.8h\n"
+ "ldr q11, [x26, x15]\n"
+ "fmla v27.8h, v6.8h, v9.8h\n"
"ldr x20, [x16, #0x80]\n"
- "fmla v28.8h, v4.8h, v9.8h\n"
- "fmla v20.8h, v3.8h, v9.8h\n"
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
"add x14, x14, #0x10\n"
- "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v9.8h\n"
- "mov v18.16b, v30.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v0.8h, v9.8h\n"
"ldr q30, [x17, #0x0]\n"
- "fmla v27.8h, v8.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v9.8h\n"
- "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v13.8h, v8.8h, v9.8h\n"
+ "fmla v16.8h, v5.8h, v9.8h\n"
+ "fmla v24.8h, v2.8h, v9.8h\n"
"ldr q9, [x25, x15]\n"
- "fmla v17.8h, v1.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
"ldr x27, [x16, #0x90]\n"
- "fmla v16.8h, v0.8h, v11.8h\n"
- "ldr q11, [x21, x15]\n"
- "fmla v15.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x21, x15]\n"
+ "fmla v20.8h, v2.8h, v11.8h\n"
"ldr x21, [x16, #0x98]\n"
- "fmla v23.8h, v8.8h, v22.8h\n"
- "fmla v10.8h, v1.8h, v12.8h\n"
- "ldr q12, [x24, x15]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x24, x15]\n"
"ldr x26, [x16, #0xa0]\n"
- "fmla v25.8h, v7.8h, v22.8h\n"
- "fmla v21.8h, v6.8h, v22.8h\n"
- "fmla v28.8h, v5.8h, v22.8h\n"
- "fmla v20.8h, v4.8h, v22.8h\n"
- "fmla v19.8h, v3.8h, v22.8h\n"
- "fmla v26.8h, v2.8h, v22.8h\n"
- "fmla v18.8h, v1.8h, v22.8h\n"
- "fmla v24.8h, v0.8h, v22.8h\n"
- "ldr q22, [x23, x15]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v22.8h, v6.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v19.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x23, x15]\n"
"ldr x25, [x16, #0xa8]\n"
- "fmla v17.8h, v3.8h, v9.8h\n"
- "fmla v27.8h, v0.8h, v9.8h\n"
- "fmla v31.8h, v6.8h, v12.8h\n"
- "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v13.8h, v0.8h, v9.8h\n"
+ "fmla v16.8h, v6.8h, v11.8h\n"
+ "fmla v24.8h, v3.8h, v11.8h\n"
"ldr q9, [x13, x15]\n"
"ldr x24, [x16, #0xb0]\n"
- "fmla v16.8h, v4.8h, v22.8h\n"
- "fmla v15.8h, v3.8h, v22.8h\n"
- "fmla v23.8h, v1.8h, v22.8h\n"
- "fmla v10.8h, v5.8h, v11.8h\n"
- "fmla v21.8h, v2.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
"ldr q12, [x22, x15]\n"
- "fmla v25.8h, v0.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v10.8h\n"
"ldr x23, [x16, #0xb8]\n"
"fmla v19.8h, v8.8h, v9.8h\n"
- "fmla v24.8h, v5.8h, v9.8h\n"
- "ldr q11, [x20, x15]\n"
+ "fmla v17.8h, v5.8h, v9.8h\n"
+ "ldr q9, [x20, x15]\n"
"ldr x22, [x16, #0xc0]\n"
- "fmla v17.8h, v5.8h, v22.8h\n"
- "fmla v27.8h, v2.8h, v22.8h\n"
- "ldr q22, [x28, x15]\n"
+ "fmla v26.8h, v5.8h, v10.8h\n"
+ "fmla v13.8h, v2.8h, v10.8h\n"
+ "ldr q11, [x28, x15]\n"
"ldr x20, [x16, #0xc8]\n"
- "fmla v16.8h, v5.8h, v12.8h\n"
- "fmla v15.8h, v4.8h, v12.8h\n"
- "fmla v23.8h, v2.8h, v12.8h\n"
- "fmla v10.8h, v3.8h, v12.8h\n"
- "fmla v25.8h, v1.8h, v12.8h\n"
- "fmla v21.8h, v0.8h, v12.8h\n"
- "ldr q9, [x21, x15]\n"
+ "fmla v18.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "fmla v22.8h, v0.8h, v12.8h\n"
+ "ldr q10, [x21, x15]\n"
"ldr x28, [x16, #0xd8]\n"
- "fmla v29.8h, v7.8h, v11.8h\n"
- "fmla v26.8h, v6.8h, v11.8h\n"
- "ldr q12, [x27, x15]\n"
+ "fmla v24.8h, v7.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x27, x15]\n"
"ldr x21, [x16, #0xd0]\n"
- "fmla v17.8h, v7.8h, v22.8h\n"
- "fmla v16.8h, v6.8h, v22.8h\n"
- "fmla v27.8h, v4.8h, v22.8h\n"
- "fmla v23.8h, v3.8h, v22.8h\n"
- "fmla v31.8h, v1.8h, v22.8h\n"
- "fmla v28.8h, v0.8h, v22.8h\n"
- "ldr q11, [x26, x15]\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "fmla v13.8h, v4.8h, v11.8h\n"
+ "fmla v21.8h, v3.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "ldr q12, [x26, x15]\n"
"ldr x27, [x16, #0xe0]\n"
- "fmla v15.8h, v8.8h, v9.8h\n"
- "fmla v18.8h, v8.8h, v12.8h\n"
- "fmla v24.8h, v7.8h, v12.8h\n"
- "ldr q12, [x25, x15]\n"
- "fmla v19.8h, v1.8h, v9.8h\n"
+ "fmla v20.8h, v8.8h, v10.8h\n"
+ "fmla v23.8h, v8.8h, v9.8h\n"
+ "fmla v17.8h, v7.8h, v9.8h\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v10.8h\n"
"ldr x26, [x16, #0xe8]\n"
- "fmla v10.8h, v7.8h, v9.8h\n"
- "fmla v25.8h, v5.8h, v9.8h\n"
- "fmla v21.8h, v4.8h, v9.8h\n"
- "fmla v20.8h, v2.8h, v9.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v22.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
"ldr q9, [x24, x15]\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla v17.8h, v2.8h, v11.8h\n"
- "fmla v16.8h, v1.8h, v11.8h\n"
- "fmla v15.8h, v0.8h, v11.8h\n"
- "ldr q22, [x23, x15]\n"
- "fmla v27.8h, v7.8h, v12.8h\n"
- "ldr x25, [x16, #0xf8]\n"
- "fmla v23.8h, v6.8h, v12.8h\n"
- "fmla v31.8h, v4.8h, v12.8h\n"
- "fmla v28.8h, v3.8h, v12.8h\n"
- "fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v26.8h, v0.8h, v12.8h\n"
- "ldr q11, [x22, x15]\n"
- "fmla v19.8h, v4.8h, v11.8h\n"
- "ldr x23, [x16, #0x100]\n"
- "fmla v18.8h, v2.8h, v11.8h\n"
- "fmla v16.8h, v2.8h, v9.8h\n"
- "fmla v15.8h, v1.8h, v9.8h\n"
- "fmla v10.8h, v0.8h, v9.8h\n"
- "ldr q9, [x20, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x15]\n"
+ "fmla v13.8h, v7.8h, v11.8h\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v16.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q10, [x22, x15]\n"
+ "fmla v18.8h, v2.8h, v9.8h\n"
+ "ldr x22, [x16, #0x100]\n"
+ "fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q11, [x20, x15]\n"
"ldr x20, [x16, #0x108]\n"
- "fmla v17.8h, v6.8h, v22.8h\n"
- "fmla v27.8h, v3.8h, v22.8h\n"
- "fmla v31.8h, v0.8h, v22.8h\n"
- "ldr q22, [x21, x15]\n"
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v13.8h, v3.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v12.8h\n"
+ "ldr q9, [x21, x15]\n"
+ "fmla v27.8h, v8.8h, v10.8h\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "ldr q10, [x28, x15]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
- "ldr x22, [x16, #0x110]\n"
- "fmla v21.8h, v7.8h, v11.8h\n"
- "fmla v20.8h, v5.8h, v11.8h\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "ldr q12, [x28, x15]\n"
- "fmla v19.8h, v2.8h, v9.8h\n"
"ldr x21, [x16, #0x118]\n"
- "fmla v29.8h, v0.8h, v22.8h\n"
- "fmla v26.8h, v4.8h, v12.8h\n"
- "fmla v18.8h, v3.8h, v12.8h\n"
- "fmla v10.8h, v8.8h, v9.8h\n"
- "fmla v21.8h, v5.8h, v9.8h\n"
- "ldr q11, [x27, x15]\n"
- "fmla v27.8h, v6.8h, v22.8h\n"
- "fmla v31.8h, v3.8h, v22.8h\n"
- "ldr q22, [x26, x15]\n"
- "fmla v28.8h, v7.8h, v12.8h\n"
- "fmla v20.8h, v6.8h, v12.8h\n"
- "fmla v29.8h, v5.8h, v12.8h\n"
- "fmla v19.8h, v5.8h, v11.8h\n"
- "fmla v24.8h, v2.8h, v11.8h\n"
- "fmla v26.8h, v7.8h, v22.8h\n"
- "fmla v18.8h, v6.8h, v22.8h\n"
- "fmla v31.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v9.8h\n"
+ "fmla v13.8h, v6.8h, v9.8h\n"
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x26, x15]\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v11.8h\n"
+ "ldr q12, [x27, x15]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x15]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v9.8h\n"
+ "fmla v23.8h, v6.8h, v9.8h\n"
+ "fmla v22.8h, v8.8h, v12.8h\n"
"ldr q12, [x24, x15]\n"
- "fmla v29.8h, v8.8h, v22.8h\n"
- "ldr q22, [x23, x15]\n"
- "fmla v28.8h, v8.8h, v12.8h\n"
- "fmla v20.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
- "fmla v26.8h, v5.8h, v12.8h\n"
- "fmla v18.8h, v4.8h, v12.8h\n"
- "fmla v24.8h, v3.8h, v12.8h\n"
- "ldr q12, [x20, x15]\n"
- "ldp x20, x24, [x16, #0x0]\n"
- "ldr q9, [x20, x6]\n"
- "fmla v21.8h, v8.8h, v11.8h\n"
- "ldr q11, [x25, x15]\n"
- "fmla v17.8h, v4.8h, v22.8h\n"
- "fmla v16.8h, v3.8h, v22.8h\n"
- "fmla v15.8h, v5.8h, v12.8h\n"
- "fmax v17.8h, v17.8h, v13.8h\n"
- "fmla v10.8h, v4.8h, v12.8h\n"
- "fmla v26.8h, v8.8h, v11.8h\n"
- "fmax v16.8h, v16.8h, v13.8h\n"
- "fmla v18.8h, v7.8h, v11.8h\n"
- "fmla v24.8h, v6.8h, v11.8h\n"
- "ldr q11, [x22, x15]\n"
- "fmax v15.8h, v15.8h, v13.8h\n"
- "fmla v27.8h, v1.8h, v22.8h\n"
- "fmla v23.8h, v0.8h, v22.8h\n"
- "ldr q22, [x21, x15]\n"
+ "fmla v24.8h, v8.8h, v9.8h\n"
+ "ldr q9, [x22, x15]\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v7.8h, v10.8h\n"
+ "fmla v19.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v4.8h, v10.8h\n"
+ "ldr q11, [x20, x15]\n"
+ "fmla v26.8h, v4.8h, v9.8h\n"
+ "ldp x20, x22, [x16, #0x0]\n"
+ "fmla v18.8h, v3.8h, v9.8h\n"
+ "fmla v13.8h, v1.8h, v9.8h\n"
+ "fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q10, [x21, x15]\n"
"ldr q0, [x17, #0x10]\n"
- "fmla v25.8h, v2.8h, v12.8h\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v17.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x23, x15]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
"ldr q2, [x17, #0x30]\n"
- "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v11.8h\n"
"ldr q1, [x17, #0x20]\n"
- "fmax v10.8h, v10.8h, v13.8h\n"
- "fmla v31.8h, v7.8h, v11.8h\n"
- "fmla v28.8h, v6.8h, v11.8h\n"
- "ldr q6, [x17, #0x70]\n"
- "fmla v20.8h, v8.8h, v22.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
"ldr q8, [x17, #0x90]\n"
- "fmla v19.8h, v7.8h, v22.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v19.8h, v7.8h, v10.8h\n"
+ "fmla v16.8h, v7.8h, v12.8h\n"
"ldr q7, [x17, #0x80]\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "fmin v16.8h, v16.8h, v14.8h\n"
- "str q17, [x12, x14]\n"
- "ldr x23, [x8, #0x20]\n"
- "fmin v15.8h, v15.8h, v14.8h\n"
- "fmin v10.8h, v10.8h, v14.8h\n"
- "str q16, [x11, x14]\n"
- "ldr x22, [x8, #0x28]\n"
- "fmax v27.8h, v27.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v13.8h\n"
- "str q15, [x10, x14]\n"
- "ldr x21, [x8, #0x30]\n"
- "fmax v25.8h, v25.8h, v13.8h\n"
- "fmax v21.8h, v21.8h, v13.8h\n"
- "str q10, [x9, x14]\n"
- "ldr x20, [x8, #0x38]\n"
- "fmla v29.8h, v4.8h, v11.8h\n"
- "fmla v26.8h, v3.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
"ldr q3, [x17, #0x40]\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "fmla v18.8h, v5.8h, v22.8h\n"
+ "fmax v13.8h, v13.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmla v23.8h, v5.8h, v10.8h\n"
"ldr q5, [x17, #0x60]\n"
- "fmla v24.8h, v4.8h, v22.8h\n"
- "ldr q10, [x24, x6]\n"
+ "ldr q11, [x21, x6]\n"
+ "ldr q12, [x20, x6]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q26, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x22, x6]\n"
"ldr q4, [x17, #0x50]\n"
- "fmin v23.8h, v23.8h, v14.8h\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
- "str q27, [x23, x14]\n"
+ "str q18, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmin v13.8h, v13.8h, v14.8h\n"
+ "str q20, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
"fmin v21.8h, v21.8h, v14.8h\n"
- "fmax v31.8h, v31.8h, v13.8h\n"
- "str q23, [x22, x14]\n"
- "ldr x25, [x8, #0x40]\n"
- "fmax v28.8h, v28.8h, v13.8h\n"
- "fmax v20.8h, v20.8h, v13.8h\n"
- "str q25, [x21, x14]\n"
- "ldr x23, [x8, #0x48]\n"
- "fmax v19.8h, v19.8h, v13.8h\n"
- "str q21, [x20, x14]\n"
- "ldr x22, [x8, #0x50]\n"
- "ldr x24, [x8, #0x58]\n"
- "ldp x21, x20, [x16, #0x10]\n"
- "ldr q11, [x21, x6]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q25, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "str q13, [x23, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "str q21, [x22, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "str q27, [x21, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "str q22, [x20, x14]\n"
+ "ldr x20, [x8, #0x58]\n"
"fmin v31.8h, v31.8h, v14.8h\n"
"fmin v28.8h, v28.8h, v14.8h\n"
- "ldr q12, [x20, x6]\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
"fmin v19.8h, v19.8h, v14.8h\n"
- "str q31, [x25, x14]\n"
- "fmax v29.8h, v29.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v13.8h\n"
- "str q28, [x23, x14]\n"
- "ldr x23, [x8, #0x60]\n"
- "fmax v18.8h, v18.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v13.8h\n"
- "str q20, [x22, x14]\n"
- "ldr x22, [x8, #0x68]\n"
- "str q19, [x24, x14]\n"
- "ldr x21, [x8, #0x70]\n"
- "ldr x20, [x8, #0x78]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
"add x6, x6, #0x10\n"
- "cmp x6, x7, LSL #4\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
"add x15, x15, #0x10\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "str q16, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "cmp x6, x7, LSL #4\n"
+ "str q31, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
"fmin v24.8h, v24.8h, v14.8h\n"
- "str q29, [x23, x14]\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q28, [x21, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
"add x17, x17, #0xa0\n"
- "str q26, [x22, x14]\n"
- "str q18, [x21, x14]\n"
- "str q24, [x20, x14]\n"
+ "str q19, [x20, x14]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q24, [x23, x14]\n"
+ "str q29, [x22, x14]\n"
+ "str q23, [x21, x14]\n"
+ "str q17, [x20, x14]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v31.16b, v30.16b\n fmla v31.8h, v4.8h, v9.8h\n"
- "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v4.8h, v9.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v8.8h, v9.8h\n"
"ldr x27, [x16, #0x20]\n"
"ldr x24, [x16, #0x30]\n"
- "mov v15.16b, v30.16b\n fmla v15.8h, v3.8h, v9.8h\n"
- "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v13.16b, v30.16b\n fmla v13.8h, v3.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v1.8h, v9.8h\n"
"ldr x23, [x16, #0x28]\n"
"ldr x22, [x16, #0x38]\n"
- "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v9.8h\n"
- "mov v20.16b, v30.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v7.8h, v9.8h\n"
"ldr x26, [x16, #0x40]\n"
"ldr x21, [x16, #0x48]\n"
- "mov v21.16b, v30.16b\n fmla v21.8h, v6.8h, v9.8h\n"
- "fmla v31.8h, v5.8h, v12.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v5.8h, v9.8h\n"
"ldr x25, [x16, #0x50]\n"
"ldr x20, [x16, #0x58]\n"
- "mov v18.16b, v30.16b\n fmla v18.8h, v5.8h, v9.8h\n"
- "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v9.8h\n"
- "ldr q24, [x24, x15]\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q22, [x24, x15]\n"
"ldr x13, [x16, #0x70]\n"
- "fmla v17.8h, v0.8h, v10.8h\n"
- "ldr q22, [x27, x15]\n"
- "mov v28.16b, v30.16b\n fmla v28.8h, v2.8h, v11.8h\n"
- "ldr q16, [x23, x15]\n"
- "fmla v15.8h, v4.8h, v12.8h\n"
- "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x27, x15]\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q23, [x23, x15]\n"
+ "fmla v13.8h, v4.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
"ldr x24, [x16, #0x60]\n"
"ldr x23, [x16, #0x68]\n"
- "fmla v19.8h, v1.8h, v12.8h\n"
- "fmla v20.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v1.8h, v12.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
"ldr x12, [x8, #0x0]\n"
"ldr x11, [x8, #0x8]\n"
- "fmla v21.8h, v7.8h, v12.8h\n"
- "mov v10.16b, v30.16b\n fmla v10.8h, v6.8h, v22.8h\n"
- "ldr q22, [x21, x15]\n"
+ "fmla v25.8h, v7.8h, v12.8h\n"
+ "mov v11.16b, v30.16b\n fmla v11.8h, v6.8h, v20.8h\n"
+ "ldr q9, [x21, x15]\n"
"ldr x28, [x16, #0x88]\n"
- "fmla v31.8h, v7.8h, v24.8h\n"
- "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v16.8h, v7.8h, v22.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
"ldr x10, [x8, #0x10]\n"
"ldr x9, [x8, #0x18]\n"
- "mov v9.16b, v30.16b\n fmla v9.8h, v3.8h, v12.8h\n"
- "mov v11.16b, v30.16b\n fmla v11.8h, v0.8h, v12.8h\n"
- "ldr q23, [x22, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v3.8h, v12.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v12.8h\n"
+ "ldr q21, [x22, x15]\n"
"ldr x22, [x16, #0x78]\n"
- "mov v12.16b, v30.16b\n fmla v12.8h, v8.8h, v16.8h\n"
- "ldr q16, [x26, x15]\n"
- "fmla v15.8h, v6.8h, v24.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v23.8h\n"
+ "ldr q23, [x26, x15]\n"
+ "fmla v13.8h, v6.8h, v22.8h\n"
"ldr x21, [x16, #0x80]\n"
- "fmla v29.8h, v4.8h, v24.8h\n"
- "fmla v19.8h, v3.8h, v24.8h\n"
+ "fmla v31.8h, v4.8h, v22.8h\n"
+ "fmla v17.8h, v3.8h, v22.8h\n"
"add x14, x14, #0x10\n"
- "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v24.8h\n"
- "mov v25.16b, v30.16b\n fmla v25.8h, v0.8h, v24.8h\n"
- "fmla v18.8h, v8.8h, v24.8h\n"
- "fmla v27.8h, v5.8h, v24.8h\n"
- "fmla v10.8h, v2.8h, v24.8h\n"
- "ldr q24, [x25, x15]\n"
- "fmla v17.8h, v1.8h, v23.8h\n"
+ "mov v12.16b, v30.16b\n fmla v12.8h, v1.8h, v22.8h\n"
+ "fmla v30.8h, v0.8h, v22.8h\n"
+ "fmla v28.8h, v8.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v21.8h\n"
"ldr x27, [x16, #0x90]\n"
- "fmla v20.8h, v0.8h, v23.8h\n"
- "ldr q23, [x20, x15]\n"
- "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v18.8h, v0.8h, v21.8h\n"
+ "ldr q21, [x20, x15]\n"
+ "fmla v25.8h, v2.8h, v23.8h\n"
"ldr x20, [x16, #0x98]\n"
- "fmla v31.8h, v8.8h, v22.8h\n"
- "fmla v28.8h, v1.8h, v16.8h\n"
- "ldr q16, [x24, x15]\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v27.8h, v1.8h, v23.8h\n"
+ "ldr q20, [x24, x15]\n"
"ldr x26, [x16, #0xa0]\n"
- "fmla v15.8h, v7.8h, v22.8h\n"
- "fmla v9.8h, v6.8h, v22.8h\n"
- "fmla v29.8h, v5.8h, v22.8h\n"
- "fmla v19.8h, v4.8h, v22.8h\n"
- "fmla v11.8h, v3.8h, v22.8h\n"
- "fmla v26.8h, v2.8h, v22.8h\n"
- "fmla v25.8h, v1.8h, v22.8h\n"
- "fmla v12.8h, v0.8h, v22.8h\n"
- "ldr q22, [x23, x15]\n"
+ "fmla v13.8h, v7.8h, v9.8h\n"
+ "fmla v10.8h, v6.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v9.8h\n"
+ "fmla v17.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v12.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v24.8h, v0.8h, v9.8h\n"
+ "ldr q23, [x23, x15]\n"
"ldr x25, [x16, #0xa8]\n"
- "fmla v17.8h, v3.8h, v24.8h\n"
- "fmla v18.8h, v0.8h, v24.8h\n"
- "fmla v27.8h, v6.8h, v16.8h\n"
- "fmla v10.8h, v3.8h, v16.8h\n"
- "ldr q16, [x13, x15]\n"
+ "fmla v19.8h, v3.8h, v22.8h\n"
+ "fmla v28.8h, v0.8h, v22.8h\n"
+ "fmla v29.8h, v6.8h, v20.8h\n"
+ "fmla v11.8h, v3.8h, v20.8h\n"
+ "ldr q20, [x13, x15]\n"
"ldr x24, [x16, #0xb0]\n"
- "fmla v20.8h, v4.8h, v22.8h\n"
- "fmla v21.8h, v3.8h, v22.8h\n"
- "fmla v31.8h, v1.8h, v22.8h\n"
- "fmla v28.8h, v5.8h, v23.8h\n"
- "fmla v9.8h, v2.8h, v23.8h\n"
- "ldr q23, [x22, x15]\n"
- "fmla v15.8h, v0.8h, v22.8h\n"
+ "fmla v18.8h, v4.8h, v23.8h\n"
+ "fmla v25.8h, v3.8h, v23.8h\n"
+ "fmla v16.8h, v1.8h, v23.8h\n"
+ "fmla v27.8h, v5.8h, v21.8h\n"
+ "fmla v10.8h, v2.8h, v21.8h\n"
+ "ldr q22, [x22, x15]\n"
+ "fmla v13.8h, v0.8h, v23.8h\n"
"ldr x23, [x16, #0xb8]\n"
- "fmla v11.8h, v8.8h, v16.8h\n"
- "fmla v12.8h, v5.8h, v16.8h\n"
- "ldr q16, [x21, x15]\n"
+ "fmla v26.8h, v8.8h, v20.8h\n"
+ "fmla v24.8h, v5.8h, v20.8h\n"
+ "ldr q21, [x21, x15]\n"
"ldr x22, [x16, #0xc0]\n"
- "fmla v17.8h, v5.8h, v22.8h\n"
- "fmla v18.8h, v2.8h, v22.8h\n"
- "ldr q22, [x28, x15]\n"
+ "fmla v19.8h, v5.8h, v23.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q20, [x28, x15]\n"
"ldr x21, [x16, #0xc8]\n"
- "fmla v20.8h, v5.8h, v23.8h\n"
- "fmla v21.8h, v4.8h, v23.8h\n"
- "fmla v31.8h, v2.8h, v23.8h\n"
- "fmla v28.8h, v3.8h, v23.8h\n"
- "fmla v15.8h, v1.8h, v23.8h\n"
- "fmla v9.8h, v0.8h, v23.8h\n"
- "ldr q23, [x20, x15]\n"
+ "fmla v18.8h, v5.8h, v22.8h\n"
+ "fmla v25.8h, v4.8h, v22.8h\n"
+ "fmla v16.8h, v2.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v13.8h, v1.8h, v22.8h\n"
+ "fmla v10.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
"ldr x28, [x16, #0xd8]\n"
- "fmla v10.8h, v7.8h, v16.8h\n"
- "fmla v26.8h, v6.8h, v16.8h\n"
- "ldr q16, [x27, x15]\n"
+ "fmla v11.8h, v7.8h, v21.8h\n"
+ "fmla v12.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x27, x15]\n"
"ldr x20, [x16, #0xd0]\n"
- "fmla v17.8h, v7.8h, v22.8h\n"
- "fmla v20.8h, v6.8h, v22.8h\n"
- "fmla v18.8h, v4.8h, v22.8h\n"
- "fmla v31.8h, v3.8h, v22.8h\n"
- "fmla v27.8h, v1.8h, v22.8h\n"
- "fmla v29.8h, v0.8h, v22.8h\n"
- "ldr q22, [x26, x15]\n"
+ "fmla v19.8h, v7.8h, v20.8h\n"
+ "fmla v18.8h, v6.8h, v20.8h\n"
+ "fmla v28.8h, v4.8h, v20.8h\n"
+ "fmla v16.8h, v3.8h, v20.8h\n"
+ "fmla v29.8h, v1.8h, v20.8h\n"
+ "fmla v31.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x26, x15]\n"
"ldr x27, [x16, #0xe0]\n"
- "fmla v21.8h, v8.8h, v23.8h\n"
- "fmla v25.8h, v8.8h, v16.8h\n"
- "fmla v12.8h, v7.8h, v16.8h\n"
- "ldr q16, [x25, x15]\n"
- "fmla v11.8h, v1.8h, v23.8h\n"
+ "fmla v25.8h, v8.8h, v22.8h\n"
+ "fmla v30.8h, v8.8h, v21.8h\n"
+ "fmla v24.8h, v7.8h, v21.8h\n"
+ "ldr q21, [x25, x15]\n"
+ "fmla v26.8h, v1.8h, v22.8h\n"
"ldr x26, [x16, #0xe8]\n"
- "fmla v28.8h, v7.8h, v23.8h\n"
- "fmla v15.8h, v5.8h, v23.8h\n"
- "fmla v9.8h, v4.8h, v23.8h\n"
- "fmla v19.8h, v2.8h, v23.8h\n"
- "ldr q23, [x24, x15]\n"
- "ldr x25, [x16, #0xf0]\n"
+ "fmla v27.8h, v7.8h, v22.8h\n"
+ "fmla v13.8h, v5.8h, v22.8h\n"
+ "fmla v10.8h, v4.8h, v22.8h\n"
"fmla v17.8h, v2.8h, v22.8h\n"
- "fmla v20.8h, v1.8h, v22.8h\n"
- "fmla v21.8h, v0.8h, v22.8h\n"
- "ldr q22, [x23, x15]\n"
- "fmla v18.8h, v7.8h, v16.8h\n"
+ "ldr q22, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v19.8h, v2.8h, v20.8h\n"
+ "fmla v18.8h, v1.8h, v20.8h\n"
+ "fmla v25.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x23, x15]\n"
+ "fmla v28.8h, v7.8h, v21.8h\n"
"ldr x24, [x16, #0xf8]\n"
- "fmla v31.8h, v6.8h, v16.8h\n"
- "fmla v27.8h, v4.8h, v16.8h\n"
- "fmla v29.8h, v3.8h, v16.8h\n"
- "fmla v10.8h, v1.8h, v16.8h\n"
- "fmla v26.8h, v0.8h, v16.8h\n"
- "ldr q16, [x22, x15]\n"
- "fmla v11.8h, v4.8h, v16.8h\n"
+ "fmla v16.8h, v6.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v31.8h, v3.8h, v21.8h\n"
+ "fmla v11.8h, v1.8h, v21.8h\n"
+ "fmla v12.8h, v0.8h, v21.8h\n"
+ "ldr q21, [x22, x15]\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
"ldr x23, [x16, #0x100]\n"
- "fmla v25.8h, v2.8h, v16.8h\n"
- "fmla v20.8h, v2.8h, v23.8h\n"
- "fmla v21.8h, v1.8h, v23.8h\n"
- "fmla v28.8h, v0.8h, v23.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
"ldr q23, [x21, x15]\n"
"ldr x22, [x16, #0x108]\n"
- "fmla v17.8h, v6.8h, v22.8h\n"
- "fmla v18.8h, v3.8h, v22.8h\n"
- "fmla v27.8h, v0.8h, v22.8h\n"
- "ldr q22, [x20, x15]\n"
- "fmla v15.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v6.8h, v20.8h\n"
+ "fmla v28.8h, v3.8h, v20.8h\n"
+ "fmla v26.8h, v4.8h, v21.8h\n"
+ "fmla v30.8h, v2.8h, v21.8h\n"
+ "fmla v29.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x20, x15]\n"
+ "fmla v13.8h, v8.8h, v21.8h\n"
"ldr x21, [x16, #0x110]\n"
- "fmla v9.8h, v7.8h, v16.8h\n"
- "fmla v19.8h, v5.8h, v16.8h\n"
- "fmla v12.8h, v1.8h, v16.8h\n"
- "ldr q16, [x28, x15]\n"
- "fmla v11.8h, v2.8h, v23.8h\n"
+ "fmla v10.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v5.8h, v21.8h\n"
+ "fmla v24.8h, v1.8h, v21.8h\n"
+ "ldr q21, [x28, x15]\n"
+ "fmla v27.8h, v8.8h, v23.8h\n"
"ldr x20, [x16, #0x118]\n"
- "fmla v10.8h, v0.8h, v22.8h\n"
- "fmla v26.8h, v4.8h, v16.8h\n"
- "fmla v25.8h, v3.8h, v16.8h\n"
- "fmla v28.8h, v8.8h, v23.8h\n"
- "fmla v9.8h, v5.8h, v23.8h\n"
- "ldr q23, [x27, x15]\n"
- "fmla v18.8h, v6.8h, v22.8h\n"
- "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v23.8h\n"
+ "fmla v11.8h, v0.8h, v20.8h\n"
+ "fmla v28.8h, v6.8h, v20.8h\n"
+ "fmla v29.8h, v3.8h, v20.8h\n"
"ldr q22, [x26, x15]\n"
- "fmla v29.8h, v7.8h, v16.8h\n"
- "fmla v19.8h, v6.8h, v16.8h\n"
- "fmla v10.8h, v5.8h, v16.8h\n"
- "fmla v11.8h, v5.8h, v23.8h\n"
- "fmla v12.8h, v2.8h, v23.8h\n"
- "fmla v26.8h, v7.8h, v22.8h\n"
- "fmla v25.8h, v6.8h, v22.8h\n"
- "fmla v27.8h, v8.8h, v16.8h\n"
- "ldr q16, [x25, x15]\n"
- "fmla v10.8h, v8.8h, v22.8h\n"
- "ldr q30, [x23, x15]\n"
- "fmla v29.8h, v8.8h, v16.8h\n"
- "fmla v19.8h, v7.8h, v16.8h\n"
- "fmla v11.8h, v6.8h, v16.8h\n"
- "fmla v26.8h, v5.8h, v16.8h\n"
- "fmla v25.8h, v4.8h, v16.8h\n"
- "fmla v12.8h, v3.8h, v16.8h\n"
- "ldr q24, [x22, x15]\n"
- "fmla v9.8h, v8.8h, v23.8h\n"
- "ldr q16, [x24, x15]\n"
- "fmla v17.8h, v4.8h, v30.8h\n"
- "fmax v17.8h, v17.8h, v13.8h\n"
- "fmla v20.8h, v3.8h, v30.8h\n"
- "fmla v21.8h, v5.8h, v24.8h\n"
- "fmax v20.8h, v20.8h, v13.8h\n"
- "fmla v28.8h, v4.8h, v24.8h\n"
- "fmla v26.8h, v8.8h, v16.8h\n"
- "fmax v21.8h, v21.8h, v13.8h\n"
- "fmla v25.8h, v7.8h, v16.8h\n"
- "fmla v12.8h, v6.8h, v16.8h\n"
- "ldr q23, [x21, x15]\n"
- "fmax v28.8h, v28.8h, v13.8h\n"
- "fmla v18.8h, v1.8h, v30.8h\n"
- "fmla v31.8h, v0.8h, v30.8h\n"
- "ldr q16, [x20, x15]\n"
- "fmin v17.8h, v17.8h, v14.8h\n"
- "fmla v15.8h, v2.8h, v24.8h\n"
- "fmla v9.8h, v1.8h, v24.8h\n"
- "fmin v20.8h, v20.8h, v14.8h\n"
- "str q17, [x12, x14]\n"
- "fmla v27.8h, v7.8h, v23.8h\n"
- "fmla v29.8h, v6.8h, v23.8h\n"
- "fmin v21.8h, v21.8h, v14.8h\n"
- "str q20, [x11, x14]\n"
- "fmla v19.8h, v8.8h, v16.8h\n"
- "fmla v11.8h, v7.8h, v16.8h\n"
- "fmin v28.8h, v28.8h, v14.8h\n"
- "str q21, [x10, x14]\n"
- "fmax v18.8h, v18.8h, v13.8h\n"
- "fmax v31.8h, v31.8h, v13.8h\n"
- "str q28, [x9, x14]\n"
+ "fmla v12.8h, v4.8h, v21.8h\n"
+ "fmla v30.8h, v3.8h, v21.8h\n"
+ "fmla v10.8h, v5.8h, v23.8h\n"
+ "ldr q20, [x27, x15]\n"
+ "fmla v31.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v6.8h, v21.8h\n"
+ "fmla v11.8h, v5.8h, v21.8h\n"
+ "fmla v29.8h, v8.8h, v21.8h\n"
+ "ldr q21, [x25, x15]\n"
+ "fmla v26.8h, v5.8h, v20.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v12.8h, v7.8h, v22.8h\n"
+ "fmla v30.8h, v6.8h, v22.8h\n"
+ "fmla v10.8h, v8.8h, v20.8h\n"
+ "ldr q20, [x24, x15]\n"
+ "fmla v11.8h, v8.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v31.8h, v8.8h, v21.8h\n"
+ "fmla v17.8h, v7.8h, v21.8h\n"
+ "fmla v26.8h, v6.8h, v21.8h\n"
+ "fmla v12.8h, v5.8h, v21.8h\n"
+ "fmla v24.8h, v3.8h, v21.8h\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "ldr q21, [x22, x15]\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v28.8h, v1.8h, v22.8h\n"
+ "fmla v16.8h, v0.8h, v22.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v25.8h, v5.8h, v21.8h\n"
+ "fmla v27.8h, v4.8h, v21.8h\n"
+ "fmla v12.8h, v8.8h, v20.8h\n"
+ "fmla v30.8h, v7.8h, v20.8h\n"
+ "fmla v24.8h, v6.8h, v20.8h\n"
+ "ldr q0, [x21, x15]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmla v13.8h, v2.8h, v21.8h\n"
+ "fmla v10.8h, v1.8h, v21.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v17.8h, v8.8h, v23.8h\n"
+ "fmla v26.8h, v7.8h, v23.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmla v29.8h, v7.8h, v0.8h\n"
+ "fmla v31.8h, v6.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v11.8h, v4.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v12.8h, v3.8h, v0.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmla v30.8h, v5.8h, v23.8h\n"
+ "fmax v13.8h, v13.8h, v15.8h\n"
+ "fmax v10.8h, v10.8h, v15.8h\n"
+ "str q19, [x12, x14]\n"
"ldr x23, [x8, #0x20]\n"
- "fmax v15.8h, v15.8h, v13.8h\n"
- "fmax v9.8h, v9.8h, v13.8h\n"
+ "str q18, [x11, x14]\n"
"ldr x22, [x8, #0x28]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q25, [x10, x14]\n"
"ldr x21, [x8, #0x30]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "str q27, [x9, x14]\n"
"ldr x20, [x8, #0x38]\n"
- "fmla v10.8h, v4.8h, v23.8h\n"
- "fmla v26.8h, v3.8h, v23.8h\n"
- "fmin v18.8h, v18.8h, v14.8h\n"
- "fmla v25.8h, v5.8h, v16.8h\n"
- "fmla v12.8h, v4.8h, v16.8h\n"
- "fmin v31.8h, v31.8h, v14.8h\n"
- "str q18, [x23, x14]\n"
- "fmin v15.8h, v15.8h, v14.8h\n"
- "fmin v9.8h, v9.8h, v14.8h\n"
- "str q31, [x22, x14]\n"
+ "fmin v13.8h, v13.8h, v14.8h\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "str q28, [x23, x14]\n"
"ldr x23, [x8, #0x40]\n"
- "fmax v27.8h, v27.8h, v13.8h\n"
- "fmax v29.8h, v29.8h, v13.8h\n"
- "str q15, [x21, x14]\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "str q16, [x22, x14]\n"
"ldr x22, [x8, #0x48]\n"
- "fmax v19.8h, v19.8h, v13.8h\n"
- "fmax v11.8h, v11.8h, v13.8h\n"
- "str q9, [x20, x14]\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q13, [x21, x14]\n"
"ldr x21, [x8, #0x50]\n"
+ "fmax v11.8h, v11.8h, v15.8h\n"
+ "fmax v12.8h, v12.8h, v15.8h\n"
+ "str q10, [x20, x14]\n"
"ldr x20, [x8, #0x58]\n"
- "fmin v27.8h, v27.8h, v14.8h\n"
- "fmin v29.8h, v29.8h, v14.8h\n"
- "str q27, [x23, x14]\n"
- "fmin v19.8h, v19.8h, v14.8h\n"
- "fmin v11.8h, v11.8h, v14.8h\n"
- "str q29, [x22, x14]\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "str q29, [x23, x14]\n"
"ldr x23, [x8, #0x60]\n"
- "fmax v10.8h, v10.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v13.8h\n"
- "str q19, [x21, x14]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v14.8h\n"
+ "str q31, [x22, x14]\n"
"ldr x22, [x8, #0x68]\n"
- "fmax v25.8h, v25.8h, v13.8h\n"
- "fmax v12.8h, v12.8h, v13.8h\n"
- "str q11, [x20, x14]\n"
+ "str q17, [x21, x14]\n"
"ldr x21, [x8, #0x70]\n"
- "ldr x20, [x8, #0x78]\n"
- "fmin v10.8h, v10.8h, v14.8h\n"
- "fmin v26.8h, v26.8h, v14.8h\n"
- "str q10, [x23, x14]\n"
- "fmin v25.8h, v25.8h, v14.8h\n"
"fmin v12.8h, v12.8h, v14.8h\n"
- "str q26, [x22, x14]\n"
- "add x15, x15, #0x10\n"
- "str q25, [x21, x14]\n"
- "str q12, [x20, x14]\n"
+ "str q26, [x20, x14]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q11, [x23, x14]\n"
+ "str q12, [x22, x14]\n"
+ "str q30, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 140f\n"
@@ -715,10 +715,10 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr q8, [x17, #0x90]\n"
"ldr x23, [x16, #0x0]\n"
"ldr x22, [x16, #0x8]\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
"ldr x21, [x16, #0x10]\n"
"ldr x20, [x16, #0x18]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
"add x21, x21, x15\n"
"add x20, x20, x15\n"
"tbz %x[n_channels], #2, 5f\n"
@@ -765,20 +765,20 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"mov v16.16b, v30.16b\n fmla v16.8h, v8.8h, v9.8h\n"
"mov v17.16b, v30.16b\n fmla v17.8h, v7.8h, v9.8h\n"
"ldr x20, [x16, #0x20]\n"
- "add x20, x20, x15\n"
"mov v18.16b, v30.16b\n fmla v18.8h, v6.8h, v9.8h\n"
"mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
"mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v9.8h\n"
"mov v25.16b, v30.16b\n fmla v25.8h, v1.8h, v9.8h\n"
"mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v9.8h\n"
"mov v19.16b, v30.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"mov v20.16b, v30.16b\n fmla v20.8h, v5.8h, v9.8h\n"
"mov v24.16b, v30.16b\n fmla v24.8h, v2.8h, v9.8h\n"
"fmla v16.8h, v0.8h, v10.8h\n"
"fmla v17.8h, v8.8h, v12.8h\n"
"fmla v18.8h, v7.8h, v12.8h\n"
- "fmla v19.8h, v6.8h, v12.8h\n"
"fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
"fmla v22.8h, v4.8h, v12.8h\n"
"mov v23.16b, v30.16b\n fmla v23.8h, v3.8h, v12.8h\n"
"fmla v25.8h, v2.8h, v12.8h\n"
@@ -853,13 +853,13 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x38]\n"
"fmla v20.8h, v8.8h, v9.8h\n"
"fmla v21.8h, v7.8h, v9.8h\n"
- "add x20, x20, x15\n"
"fmla v22.8h, v6.8h, v9.8h\n"
"fmla v24.8h, v5.8h, v9.8h\n"
"fmla v25.8h, v4.8h, v9.8h\n"
"fmla v26.8h, v3.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v9.8h\n"
"mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x20, x15\n"
"fmla v30.8h, v0.8h, v9.8h\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
@@ -932,13 +932,13 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x50]\n"
"fmla v21.8h, v8.8h, v10.8h\n"
"fmla v22.8h, v7.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v23.8h, v6.8h, v10.8h\n"
"fmla v25.8h, v5.8h, v10.8h\n"
"fmla v26.8h, v4.8h, v10.8h\n"
"fmla v27.8h, v3.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"fmla v31.8h, v0.8h, v10.8h\n"
"tbz %x[n_channels], #2, 33f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
@@ -1035,11 +1035,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x70]\n"
"fmla v16.8h, v5.8h, v10.8h\n"
"fmla v17.8h, v4.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v18.8h, v3.8h, v10.8h\n"
"fmla v20.8h, v2.8h, v10.8h\n"
"fmla v21.8h, v1.8h, v10.8h\n"
"fmla v22.8h, v0.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 49f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
@@ -1087,11 +1087,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x80]\n"
"fmla v17.8h, v5.8h, v12.8h\n"
"fmla v18.8h, v4.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v19.8h, v3.8h, v12.8h\n"
"fmla v21.8h, v2.8h, v12.8h\n"
"fmla v22.8h, v1.8h, v12.8h\n"
"fmla v23.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 57f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
@@ -1139,11 +1139,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x90]\n"
"fmla v16.8h, v7.8h, v10.8h\n"
"fmla v17.8h, v6.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v20.8h, v4.8h, v10.8h\n"
"fmla v21.8h, v3.8h, v10.8h\n"
"fmla v24.8h, v1.8h, v10.8h\n"
"fmla v25.8h, v0.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 65f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -1191,11 +1191,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xa0]\n"
"fmla v18.8h, v8.8h, v12.8h\n"
"fmla v19.8h, v7.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v22.8h, v5.8h, v12.8h\n"
"fmla v23.8h, v4.8h, v12.8h\n"
"fmla v26.8h, v2.8h, v12.8h\n"
"fmla v27.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 73f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
@@ -1219,8 +1219,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xa8]\n"
"fmla v16.8h, v2.8h, v10.8h\n"
"fmla v17.8h, v1.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v18.8h, v0.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 77f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
@@ -1244,11 +1244,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xb0]\n"
"fmla v20.8h, v7.8h, v11.8h\n"
"fmla v21.8h, v6.8h, v11.8h\n"
- "add x20, x20, x15\n"
"fmla v24.8h, v4.8h, v11.8h\n"
"fmla v25.8h, v3.8h, v11.8h\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 81f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 80f\n"
@@ -1272,8 +1272,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xb8]\n"
"fmla v17.8h, v2.8h, v12.8h\n"
"fmla v18.8h, v1.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v19.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 84f\n"
@@ -1297,8 +1297,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xc0]\n"
"fmla v16.8h, v6.8h, v10.8h\n"
"fmla v20.8h, v3.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v24.8h, v0.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 89f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 88f\n"
@@ -1322,11 +1322,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xc8]\n"
"fmla v22.8h, v8.8h, v11.8h\n"
"fmla v23.8h, v7.8h, v11.8h\n"
- "add x20, x20, x15\n"
"fmla v26.8h, v5.8h, v11.8h\n"
"fmla v27.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v11.8h\n"
"fmla v31.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 93f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 92f\n"
@@ -1350,8 +1350,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xd0]\n"
"fmla v19.8h, v8.8h, v12.8h\n"
"fmla v23.8h, v5.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v27.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 97f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 96f\n"
@@ -1375,8 +1375,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xd8]\n"
"fmla v20.8h, v6.8h, v10.8h\n"
"fmla v24.8h, v3.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v28.8h, v0.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 101f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 100f\n"
@@ -1400,11 +1400,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xe0]\n"
"fmla v24.8h, v8.8h, v11.8h\n"
"fmla v25.8h, v7.8h, v11.8h\n"
- "add x20, x20, x15\n"
"fmla v26.8h, v6.8h, v11.8h\n"
"fmla v28.8h, v5.8h, v11.8h\n"
"fmla v29.8h, v4.8h, v11.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 105f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 104f\n"
@@ -1428,8 +1428,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xe8]\n"
"fmla v23.8h, v8.8h, v12.8h\n"
"fmla v27.8h, v5.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 109f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 108f\n"
@@ -1453,8 +1453,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xf0]\n"
"fmla v28.8h, v8.8h, v10.8h\n"
"fmla v29.8h, v7.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v30.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 113f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 112f\n"
@@ -1478,11 +1478,11 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xf8]\n"
"fmla v25.8h, v8.8h, v11.8h\n"
"fmla v26.8h, v7.8h, v11.8h\n"
- "add x20, x20, x15\n"
"fmla v27.8h, v6.8h, v11.8h\n"
"fmla v29.8h, v5.8h, v11.8h\n"
"fmla v30.8h, v4.8h, v11.8h\n"
"fmla v31.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 117f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 116f\n"
@@ -1506,8 +1506,8 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x100]\n"
"fmla v29.8h, v8.8h, v12.8h\n"
"fmla v30.8h, v7.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v31.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 121f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 120f\n"
@@ -1531,9 +1531,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x108]\n"
"fmla v16.8h, v4.8h, v10.8h\n"
"fmla v17.8h, v3.8h, v10.8h\n"
- "add x20, x20, x15\n"
"fmla v20.8h, v1.8h, v10.8h\n"
"fmla v21.8h, v0.8h, v10.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 125f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 124f\n"
@@ -1557,9 +1557,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x110]\n"
"fmla v18.8h, v5.8h, v11.8h\n"
"fmla v19.8h, v4.8h, v11.8h\n"
- "add x20, x20, x15\n"
"fmla v22.8h, v2.8h, v11.8h\n"
"fmla v23.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 129f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 128f\n"
@@ -1583,9 +1583,9 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x118]\n"
"fmla v24.8h, v7.8h, v12.8h\n"
"fmla v25.8h, v6.8h, v12.8h\n"
- "add x20, x20, x15\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #2, 133f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 132f\n"
@@ -1608,24 +1608,24 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"135:" // Oddments: Load input (4, 4): Bit 2: End
"fmla v26.8h, v8.8h, v10.8h\n"
"fmla v27.8h, v7.8h, v10.8h\n"
- "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
"fmla v30.8h, v5.8h, v10.8h\n"
"fmla v31.8h, v4.8h, v10.8h\n"
- "fmax v17.8h, v17.8h, v13.8h\n"
- "fmax v18.8h, v18.8h, v13.8h\n"
- "fmax v19.8h, v19.8h, v13.8h\n"
- "fmax v20.8h, v20.8h, v13.8h\n"
- "fmax v21.8h, v21.8h, v13.8h\n"
- "fmax v22.8h, v22.8h, v13.8h\n"
- "fmax v23.8h, v23.8h, v13.8h\n"
- "fmax v24.8h, v24.8h, v13.8h\n"
- "fmax v25.8h, v25.8h, v13.8h\n"
- "fmax v26.8h, v26.8h, v13.8h\n"
- "fmax v27.8h, v27.8h, v13.8h\n"
- "fmax v28.8h, v28.8h, v13.8h\n"
- "fmax v29.8h, v29.8h, v13.8h\n"
- "fmax v30.8h, v30.8h, v13.8h\n"
- "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
"fmin v16.8h, v16.8h, v14.8h\n"
"fmin v17.8h, v17.8h, v14.8h\n"
"fmin v18.8h, v18.8h, v14.8h\n"
@@ -1645,150 +1645,150 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"tbz %x[n_channels], #2, 137f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.d }[0], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.d }[0], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.d }[0], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.d }[0], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.d }[0], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.d }[0], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.d }[0], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.d }[0], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.d }[0], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.d }[0], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.d }[0], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.d }[0], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
- "add x14, x14, #0x8\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.d }[0], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.d }[0], [x22]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 136f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.s }[2], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.s }[2], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.s }[2], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.s }[2], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.s }[2], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.s }[2], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.s }[2], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.s }[2], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.s }[2], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.s }[2], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.s }[2], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.s }[2], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
- "add x14, x14, #0x4\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.s }[2], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.s }[2], [x22]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.h }[6], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.h }[6], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.h }[6], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.h }[6], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.h }[6], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.h }[6], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.h }[6], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.h }[6], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.h }[6], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.h }[6], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.h }[6], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.h }[6], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.h }[6], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.h }[6], [x22]\n"
+ "add x20, x20, x14\n"
"st1 { v30.h }[6], [x21]\n"
"st1 { v31.h }[6], [x20]\n"
"b 139f\n"
@@ -1796,50 +1796,50 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"tbz %x[n_channels], #0, 139f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.h }[4], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.h }[4], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.h }[4], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.h }[4], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.h }[4], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.h }[4], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.h }[4], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.h }[4], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.h }[4], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.h }[4], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.h }[4], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.h }[4], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.h }[4], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.h }[4], [x22]\n"
+ "add x20, x20, x14\n"
"st1 { v30.h }[4], [x21]\n"
"st1 { v31.h }[4], [x20]\n"
"b 139f\n"
@@ -1847,150 +1847,150 @@ void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"tbz %x[n_channels], #1, 138f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.s }[0], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.s }[0], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.s }[0], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.s }[0], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.s }[0], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.s }[0], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.s }[0], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.s }[0], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.s }[0], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.s }[0], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.s }[0], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.s }[0], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
- "add x14, x14, #0x4\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.s }[0], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.s }[0], [x22]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
"st1 { v30.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 139f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.h }[2], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.h }[2], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.h }[2], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.h }[2], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.h }[2], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.h }[2], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.h }[2], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.h }[2], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.h }[2], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.h }[2], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.h }[2], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.h }[2], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.h }[2], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.h }[2], [x22]\n"
+ "add x20, x20, x14\n"
"st1 { v30.h }[2], [x21]\n"
"st1 { v31.h }[2], [x20]\n"
"b 139f\n"
"138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.h }[0], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.h }[0], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.h }[0], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.h }[0], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.h }[0], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.h }[0], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.h }[0], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.h }[0], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.h }[0], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.h }[0], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.h }[0], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.h }[0], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.h }[0], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.h }[0], [x22]\n"
+ "add x20, x20, x14\n"
"st1 { v30.h }[0], [x21]\n"
"st1 { v31.h }[0], [x20]\n"
"139:" // Oddments: Store: Bit 2: End
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 8954999990..badc0ddf36 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,259 +87,259 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x23, #0x0\n"
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
"1:" // Tile loop
- "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x26, #0x4\n"
- "mov x25, #0x2\n"
- "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x9, #0x4\n"
+ "mov x28, #0x2\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
"ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x6, x6, #0x1\n"
- "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
- "add x8, x8, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x16, x8, x24, LSL #1\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
- "lsr x22, %x[n_channels], #0x3\n"
- "add x14, x16, x24, LSL #1\n"
- "mul x20, x20, x25\n" // offset *= output_tile_size
- "add x13, x6, x6\n"
- "add x12, x14, x24, LSL #1\n"
- "add x11, x13, x6\n"
- "add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x24, %x[n_channels], #0x3\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v26.8h }, [x20]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x23, #0x0\n"
"ld1r { v27.8h }, [x20]\n"
- "add x10, x12, x24, LSL #1\n"
- "add x9, x11, x6\n"
- "add x28, x17, x21, LSL #1\n"
+ "mul x22, x11, x27\n" // offset = tile_i * ld_input_row
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x21, XZR, x26\n"
+ "mul x20, x11, x25\n" // offset = tile_i * ld_output_row
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x22, x10, x6, x22\n" // offset += tile_j * ld_input_col
+ "lsl x6, x6, #0x1\n"
+ "madd x20, x10, x7, x20\n" // offset += tile_j * ld_output_col
"lsl x7, x7, #0x1\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q31, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldr q9, [x14, x13]\n"
+ "mul x22, x22, x9\n" // offset *= kernel_stride * output_size
+ "add x15, x6, x6\n"
+ "add x14, x15, x6\n"
+ "add x13, x14, x6\n"
+ "mul x20, x20, x28\n" // offset *= output_tile_size
+ "add x8, x8, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x12, x8, x27, LSL #1\n"
+ "add x11, x12, x27, LSL #1\n"
+ "add x10, x11, x27, LSL #1\n"
+ "add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x9, x10, x27, LSL #1\n"
+ "add x28, x17, x25, LSL #1\n"
+ "cbz x24, 4f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x26, x24, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x11, x15]\n"
"ld1 { v10.8h }, [x8]\n"
"ldr q11, [x8, x6]\n"
- "ldr q12, [x8, x11]\n"
- "ldr q13, [x8, x9]\n"
- "ld1 { v14.8h }, [x16]\n"
- "ldr q15, [x16, x6]\n"
- "ldr q16, [x8, x13]\n"
+ "ldr q12, [x8, x14]\n"
+ "ldr q13, [x8, x13]\n"
+ "ld1 { v14.8h }, [x12]\n"
+ "ldr q15, [x12, x6]\n"
+ "ldr q16, [x8, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
"mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
- "add x23, x23, #0x10\n"
+ "add x26, x26, #0x10\n"
"add x8, x8, #0x10\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "cmp x26, x24, LSL #4\n"
+ "add x21, x21, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v29.8h, v0.8h, v10.8h\n"
"ld1 { v10.8h }, [x8]\n"
"fmla v28.8h, v1.8h, v12.8h\n"
- "ldr q21, [x16, x9]\n"
+ "ldr q21, [x12, x13]\n"
"fmla v29.8h, v1.8h, v11.8h\n"
- "ldr q18, [x16, x11]\n"
+ "ldr q18, [x12, x14]\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q17, [x16, x13]\n"
+ "ldr q20, [x12, x15]\n"
+ "add x12, x12, #0x10\n"
"fmla v29.8h, v3.8h, v14.8h\n"
- "ld1 { v20.8h }, [x12]\n"
+ "ld1 { v17.8h }, [x10]\n"
"fmla v28.8h, v0.8h, v16.8h\n"
- "add x16, x16, #0x10\n"
"fmla v29.8h, v4.8h, v15.8h\n"
- "ld1 { v25.8h }, [x14]\n"
+ "ld1 { v23.8h }, [x11]\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr q19, [x10, x13]\n"
"fmla v28.8h, v4.8h, v18.8h\n"
- "ldr q19, [x12, x6]\n"
+ "ldr q17, [x10, x6]\n"
"fmla v29.8h, v2.8h, v16.8h\n"
- "ldr q18, [x14, x6]\n"
+ "ldr q22, [x11, x6]\n"
"fmla v28.8h, v5.8h, v21.8h\n"
- "ldr q24, [x14, x11]\n"
- "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
- "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
- "ldr q31, [x15, #0x0]\n"
- "cmp x23, x22, LSL #4\n"
- "fmla v29.8h, v5.8h, v17.8h\n"
- "fmla v28.8h, v3.8h, v17.8h\n"
- "ldr q17, [x12, x11]\n"
- "add x20, x20, #0x10\n"
- "fmla v23.8h, v3.8h, v20.8h\n"
- "ldr q16, [x12, x9]\n"
- "fmla v22.8h, v4.8h, v17.8h\n"
- "ldr q21, [x10, x6]\n"
- "fmla v23.8h, v0.8h, v25.8h\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v22.8h, v1.8h, v24.8h\n"
- "add x21, x21, #0x10\n"
- "fmla v23.8h, v4.8h, v19.8h\n"
- "ldr q20, [x14, x9]\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v22.8h, v5.8h, v16.8h\n"
- "ldr q19, [x10, x11]\n"
- "fmla v29.8h, v6.8h, v25.8h\n"
- "ld1 { v17.8h }, [x10]\n"
- "fmla v23.8h, v1.8h, v18.8h\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v22.8h, v2.8h, v20.8h\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v29.8h, v7.8h, v18.8h\n"
- "ldr q16, [x12, x13]\n"
- "fmla v23.8h, v6.8h, v17.8h\n"
- "ldr q18, [x10, x13]\n"
- "fmla v22.8h, v3.8h, v16.8h\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v23.8h, v7.8h, v21.8h\n"
- "ldr q13, [x8, x9]\n"
- "fmla v22.8h, v7.8h, v19.8h\n"
- "ld1 { v14.8h }, [x16]\n"
- "fmla v28.8h, v7.8h, v24.8h\n"
- "ldr q12, [x8, x11]\n"
- "fmla v23.8h, v5.8h, v16.8h\n"
- "ldr q16, [x8, x13]\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x11, x14]\n"
+ "fmla v25.8h, v0.8h, v23.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v29.8h, v5.8h, v20.8h\n"
+ "fmla v28.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x10, x14]\n"
+ "fmla v24.8h, v4.8h, v16.8h\n"
+ "ldr q21, [x9, x6]\n"
+ "fmla v25.8h, v4.8h, v17.8h\n"
+ "ldr q20, [x11, x13]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, #0x10\n"
+ "ldr q9, [x11, x15]\n"
+ "fmla v29.8h, v6.8h, v23.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v18.8h\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "ldr q12, [x8, x14]\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v24.8h, v5.8h, v19.8h\n"
+ "ldr q19, [x9, x14]\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "ldr q16, [x10, x15]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x9, x15]\n"
"fmla v28.8h, v8.8h, v20.8h\n"
- "ldr q17, [x10, x9]\n"
- "ldr q6, [x15, #0x70]\n"
- "fmla v23.8h, v8.8h, v18.8h\n"
- "fmla v22.8h, v8.8h, v17.8h\n"
- "ldr q11, [x8, x6]\n"
- "ldr q15, [x16, x6]\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "ldr q17, [x9, x13]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x9, x9, #0x10\n"
"fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v25.8h, v7.8h, v21.8h\n"
+ "ldr q13, [x8, x13]\n"
"fmax v28.8h, v28.8h, v26.8h\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "fmax v23.8h, v23.8h, v26.8h\n"
- "fmax v22.8h, v22.8h, v26.8h\n"
- "add x14, x14, #0x10\n"
- "ldr q9, [x14, x13]\n"
+ "fmla v24.8h, v3.8h, v16.8h\n"
+ "ldr q3, [x16, #0x40]\n"
"fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x8, x15]\n"
+ "ldr q5, [x16, #0x60]\n"
"fmin v28.8h, v28.8h, v27.8h\n"
- "fmin v23.8h, v23.8h, v27.8h\n"
- "fmin v22.8h, v22.8h, v27.8h\n"
- "add x12, x12, #0x10\n"
- "add x10, x10, #0x10\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
+ "ldr q7, [x16, #0x80]\n"
"st1 { v29.8h }, [x17]\n"
- "add x15, x15, #0xa0\n"
+ "fmla v25.8h, v8.8h, v18.8h\n"
"str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "st1 { v23.8h }, [x28]\n"
- "str q22, [x28, x7]\n"
+ "fmla v24.8h, v6.8h, v18.8h\n"
+ "ldr q15, [x12, x6]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "st1 { v25.8h }, [x28]\n"
+ "str q24, [x28, x7]\n"
"add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
- "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
"add x8, x8, #0x10\n"
- "fmla v29.8h, v0.8h, v10.8h\n"
- "fmla v28.8h, v1.8h, v12.8h\n"
- "ldr q20, [x16, x9]\n"
- "fmla v29.8h, v1.8h, v11.8h\n"
- "ldr q18, [x16, x11]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "ldr q17, [x16, x13]\n"
- "fmla v29.8h, v3.8h, v14.8h\n"
- "ld1 { v19.8h }, [x12]\n"
- "fmla v28.8h, v0.8h, v16.8h\n"
- "add x16, x16, #0x10\n"
- "fmla v29.8h, v4.8h, v15.8h\n"
- "ld1 { v25.8h }, [x14]\n"
- "fmla v28.8h, v4.8h, v18.8h\n"
- "ldr q18, [x12, x6]\n"
- "fmla v29.8h, v2.8h, v16.8h\n"
- "ldr q24, [x14, x6]\n"
- "fmla v28.8h, v5.8h, v20.8h\n"
- "ldr q23, [x14, x11]\n"
- "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
- "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
- "fmla v29.8h, v5.8h, v17.8h\n"
- "fmla v28.8h, v3.8h, v17.8h\n"
- "ldr q17, [x12, x11]\n"
- "fmla v22.8h, v3.8h, v19.8h\n"
- "ldr q16, [x12, x9]\n"
- "fmla v21.8h, v4.8h, v17.8h\n"
- "ldr q20, [x10, x6]\n"
- "fmla v22.8h, v0.8h, v25.8h\n"
- "fmla v21.8h, v1.8h, v23.8h\n"
- "fmla v22.8h, v4.8h, v18.8h\n"
- "ldr q19, [x14, x9]\n"
- "fmla v21.8h, v5.8h, v16.8h\n"
- "ldr q18, [x10, x11]\n"
- "fmla v29.8h, v6.8h, v25.8h\n"
- "ld1 { v17.8h }, [x10]\n"
- "fmla v22.8h, v1.8h, v24.8h\n"
- "add x14, x14, #0x10\n"
- "fmla v21.8h, v2.8h, v19.8h\n"
- "fmla v29.8h, v7.8h, v24.8h\n"
- "ldr q16, [x12, x13]\n"
- "fmax v29.8h, v29.8h, v26.8h\n"
- "fmla v22.8h, v6.8h, v17.8h\n"
- "ldr q17, [x10, x13]\n"
- "fmla v21.8h, v3.8h, v16.8h\n"
- "fmin v29.8h, v29.8h, v27.8h\n"
- "fmla v22.8h, v7.8h, v20.8h\n"
- "fmla v21.8h, v7.8h, v18.8h\n"
- "st1 { v29.8h }, [x17]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x12, x13]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x12, x14]\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x12, x15]\n"
"add x12, x12, #0x10\n"
- "fmla v28.8h, v7.8h, v23.8h\n"
- "fmla v22.8h, v5.8h, v16.8h\n"
- "fmla v21.8h, v6.8h, v17.8h\n"
- "fmla v28.8h, v8.8h, v19.8h\n"
- "ldr q16, [x10, x9]\n"
- "fmax v28.8h, v28.8h, v26.8h\n"
- "fmla v22.8h, v8.8h, v17.8h\n"
- "fmla v21.8h, v8.8h, v16.8h\n"
- "fmax v22.8h, v22.8h, v26.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr q23, [x10, x13]\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "ld1 { v22.8h }, [x11]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x10, x6]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x11, x6]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v21.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "fmla v29.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x10, x14]\n"
+ "fmla v24.8h, v4.8h, v16.8h\n"
+ "ldr q21, [x9, x6]\n"
+ "fmla v25.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x11, x13]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v28.8h, v6.8h, v22.8h\n"
+ "ld1 { v16.8h }, [x9]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "fmla v25.8h, v1.8h, v18.8h\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "ldr q19, [x10, x15]\n"
"add x10, x10, #0x10\n"
- "fmax v21.8h, v21.8h, v26.8h\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v24.8h, v5.8h, v23.8h\n"
+ "ldr q18, [x9, x14]\n"
+ "fmla v25.8h, v6.8h, v16.8h\n"
+ "ldr q17, [x9, x15]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "ldr q16, [x9, x13]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v25.8h, v7.8h, v21.8h\n"
"fmin v28.8h, v28.8h, v27.8h\n"
- "str q28, [x17, x7]\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v24.8h, v3.8h, v19.8h\n"
+ "st1 { v28.8h }, [x17]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "str q29, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "fmin v22.8h, v22.8h, v27.8h\n"
- "fmin v21.8h, v21.8h, v27.8h\n"
- "st1 { v22.8h }, [x28]\n"
- "str q21, [x28, x7]\n"
+ "fmla v24.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "fmla v24.8h, v8.8h, v16.8h\n"
+ "st1 { v25.8h }, [x28]\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "str q24, [x28, x7]\n"
"add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 81f\n"
- "ldr q31, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "add x27, x14, x13\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x27, x11, x15\n"
"add x26, x8, XZR\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
"add x25, x8, x6\n"
- "add x24, x8, x11\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "add x23, x8, x9\n"
- "add x22, x16, XZR\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "add x21, x16, x6\n"
- "add x20, x8, x13\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
+ "add x24, x8, x14\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x23, x8, x13\n"
+ "add x22, x12, XZR\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "add x21, x12, x6\n"
+ "add x20, x8, x15\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
"tbz %x[n_channels], #2, 6f\n"
"ldr d9, [x27], #0x8\n"
"ldr d10, [x26], #0x8\n"
@@ -410,18 +410,18 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h16, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
"mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "add x20, x16, x11\n"
"mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
+ "add x20, x12, x14\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
"fmla v28.8h, v4.8h, v15.8h\n"
- "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 10f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -443,7 +443,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h11, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v29.8h, v4.8h, v11.8h\n"
- "add x20, x16, x9\n"
+ "add x20, x12, x13\n"
"tbz %x[n_channels], #2, 14f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 13f\n"
@@ -465,7 +465,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h12, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v29.8h, v5.8h, v12.8h\n"
- "add x20, x16, x13\n"
+ "add x20, x12, x15\n"
"tbz %x[n_channels], #2, 18f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
@@ -488,7 +488,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "add x20, x12, XZR\n"
+ "add x20, x10, XZR\n"
"tbz %x[n_channels], #2, 22f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 21f\n"
@@ -510,7 +510,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h14, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v30.8h, v3.8h, v14.8h\n"
- "add x20, x14, XZR\n"
+ "add x20, x11, XZR\n"
"tbz %x[n_channels], #2, 26f\n"
"ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
@@ -533,7 +533,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
"fmla v28.8h, v6.8h, v15.8h\n"
"fmla v30.8h, v0.8h, v15.8h\n"
- "add x20, x12, x6\n"
+ "add x20, x10, x6\n"
"tbz %x[n_channels], #2, 30f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
@@ -555,7 +555,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h11, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
"fmla v30.8h, v4.8h, v11.8h\n"
- "add x20, x14, x6\n"
+ "add x20, x11, x6\n"
"tbz %x[n_channels], #2, 34f\n"
"ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
@@ -578,7 +578,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"36:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
"fmla v28.8h, v7.8h, v16.8h\n"
"fmla v30.8h, v1.8h, v16.8h\n"
- "add x20, x12, x11\n"
+ "add x20, x10, x14\n"
"tbz %x[n_channels], #2, 38f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
@@ -600,7 +600,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h13, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
"fmla v31.8h, v4.8h, v13.8h\n"
- "add x20, x14, x11\n"
+ "add x20, x11, x14\n"
"tbz %x[n_channels], #2, 42f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
@@ -623,7 +623,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
"fmla v29.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "add x20, x12, x9\n"
+ "add x20, x10, x13\n"
"tbz %x[n_channels], #2, 46f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
@@ -645,7 +645,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h14, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
"fmla v31.8h, v5.8h, v14.8h\n"
- "add x20, x10, XZR\n"
+ "add x20, x9, XZR\n"
"tbz %x[n_channels], #2, 50f\n"
"ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
@@ -667,7 +667,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h15, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
"fmla v30.8h, v6.8h, v15.8h\n"
- "add x20, x14, x9\n"
+ "add x20, x11, x13\n"
"tbz %x[n_channels], #2, 54f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
@@ -690,7 +690,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"56:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
"fmla v29.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "add x20, x10, x6\n"
+ "add x20, x9, x6\n"
"tbz %x[n_channels], #2, 58f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 57f\n"
@@ -712,7 +712,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h13, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
"fmla v30.8h, v7.8h, v13.8h\n"
- "add x20, x12, x13\n"
+ "add x20, x10, x15\n"
"tbz %x[n_channels], #2, 62f\n"
"ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
@@ -735,7 +735,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"64:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
"fmla v30.8h, v5.8h, v16.8h\n"
"fmla v31.8h, v3.8h, v16.8h\n"
- "add x20, x10, x11\n"
+ "add x20, x9, x14\n"
"tbz %x[n_channels], #2, 66f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
@@ -757,7 +757,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr h14, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
"fmla v31.8h, v7.8h, v14.8h\n"
- "add x20, x10, x13\n"
+ "add x20, x9, x15\n"
"tbz %x[n_channels], #2, 70f\n"
"ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
@@ -780,7 +780,7 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"72:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v15.8h\n"
- "add x20, x10, x9\n"
+ "add x20, x9, x13\n"
"tbz %x[n_channels], #2, 74f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 73f\n"
@@ -805,27 +805,27 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmax v28.8h, v28.8h, v26.8h\n"
"fmax v29.8h, v29.8h, v26.8h\n"
"fmax v30.8h, v30.8h, v26.8h\n"
- "fmax v31.8h, v31.8h, v26.8h\n"
"fmin v28.8h, v28.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
"fmin v29.8h, v29.8h, v27.8h\n"
"fmin v30.8h, v30.8h, v27.8h\n"
"fmin v31.8h, v31.8h, v27.8h\n"
"tbz %x[n_channels], #2, 78f\n"
"mov x21, x17\n"
"mov x20, x28\n"
- "st1 { v28.d }[0], [x21], x7\n"
- "st1 { v30.d }[0], [x20], x7\n"
"add x17, x17, #0x8\n"
"add x28, x28, #0x8\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
"st1 { v29.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 77f\n"
"mov x21, x17\n"
"mov x20, x28\n"
- "st1 { v28.s }[2], [x21], x7\n"
- "st1 { v30.s }[2], [x20], x7\n"
"add x17, x17, #0x4\n"
"add x28, x28, #0x4\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
"st1 { v29.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 80f\n"
@@ -849,10 +849,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"tbz %x[n_channels], #1, 79f\n"
"mov x21, x17\n"
"mov x20, x28\n"
- "st1 { v28.s }[0], [x21], x7\n"
- "st1 { v30.s }[0], [x20], x7\n"
"add x17, x17, #0x4\n"
"add x28, x28, #0x4\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 80f\n"
@@ -872,16 +872,16 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"st1 { v31.h }[0], [x20]\n"
"80:" // Tile loop: Oddments: Store: Bit 2: End
"81:" // Tile loop: End
- "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x27, x27, #0x1\n"
- "add x21, x23, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x27, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x23, x23, x21, LT\n"
- "csel x27, x27, XZR, LT\n"
- "cmp x23, x20\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x10, x10, #0x1\n"
+ "add x20, x11, #0x1\n"
+ "cmp x10, x22\n"
+ "csel x11, x11, x20, LT\n"
+ "csel x10, x10, XZR, LT\n"
+ "cmp x11, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 6ae0b30afd..87a75b1026 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,275 +87,275 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "lsr x24, %x[n_channels], #0x3\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v26.8h }, [x20]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x3\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x21]\n"
"ld1r { v27.8h }, [x20]\n"
- "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x12, x11, [x21, #0x0]\n"
- "ldp x10, x9, [x21, #0x10]\n"
- "mov x28, #0x0\n"
- "sub x22, XZR, x25\n"
- "cbz x24, 3f\n"
- "ldr q31, [x23, #0x0]\n"
- "ldr q0, [x23, #0x10]\n"
- "cmp x25, x24, LSL #4\n"
- "ldr q1, [x23, #0x20]\n"
- "ldr q2, [x23, #0x30]\n"
- "ldr q3, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
- "ldr q5, [x23, #0x60]\n"
- "ldr q6, [x23, #0x70]\n"
- "ldr q7, [x23, #0x80]\n"
- "ldr q8, [x23, #0x90]\n"
- "add x23, x23, #0xa0\n"
- "ldp x21, x20, [x13, #0x0]\n"
- "ldr q9, [x21, x28]\n"
- "ldr q10, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "ldr q11, [x21, x28]\n"
- "ldr q12, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x20]\n"
- "ldr q13, [x21, x28]\n"
- "ldr q14, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x30]\n"
- "ldr q15, [x21, x28]\n"
- "ldr q16, [x20, x28]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x22, #0x0]\n"
+ "ldp x11, x10, [x22, #0x10]\n"
+ "sub x9, XZR, x8\n"
+ "cbz x17, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x8, x17, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q9, [x27, x14]\n"
+ "ldr q10, [x26, x14]\n"
+ "ldr q11, [x25, x14]\n"
+ "ldr q12, [x24, x14]\n"
+ "ldr q13, [x23, x14]\n"
+ "ldr q14, [x22, x14]\n"
+ "ldr q15, [x21, x14]\n"
+ "ldr q16, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v24.16b, v31.16b\n fmla v24.8h, v8.8h, v9.8h\n"
- "mov v23.16b, v31.16b\n fmla v23.8h, v6.8h, v9.8h\n"
- "ldr x21, [x13, #0x40]\n"
- "ldr x20, [x13, #0x48]\n"
- "fmla v24.8h, v0.8h, v10.8h\n"
- "fmla v23.8h, v1.8h, v12.8h\n"
- "ldr q20, [x20, x28]\n"
- "ldr x20, [x13, #0x50]\n"
- "fmla v24.8h, v1.8h, v11.8h\n"
- "ldr q19, [x21, x28]\n"
- "fmla v23.8h, v2.8h, v13.8h\n"
- "ldr q18, [x20, x28]\n"
- "fmla v24.8h, v3.8h, v14.8h\n"
- "fmla v23.8h, v0.8h, v16.8h\n"
- "ldr x20, [x13, #0x58]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v24.8h, v4.8h, v15.8h\n"
- "fmla v23.8h, v4.8h, v19.8h\n"
- "ldr x21, [x13, #0x78]\n"
- "ldr x20, [x13, #0x60]\n"
- "ldr q22, [x20, x28]\n"
- "fmla v24.8h, v2.8h, v16.8h\n"
- "fmla v23.8h, v5.8h, v20.8h\n"
- "ldr x20, [x13, #0x80]\n"
- "ldr q21, [x20, x28]\n"
- "mov v20.16b, v31.16b\n fmla v20.8h, v2.8h, v9.8h\n"
- "mov v19.16b, v31.16b\n fmla v19.8h, v0.8h, v9.8h\n"
- "ldr q31, [x23, #0x0]\n"
- "fmla v24.8h, v5.8h, v18.8h\n"
- "fmla v23.8h, v3.8h, v18.8h\n"
- "ldr q16, [x21, x28]\n"
- "ldr x20, [x13, #0x68]\n"
- "ldr q18, [x20, x28]\n"
- "fmla v20.8h, v3.8h, v17.8h\n"
- "fmla v19.8h, v4.8h, v16.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v20.8h, v0.8h, v22.8h\n"
- "ldr q0, [x23, #0x10]\n"
- "fmla v19.8h, v1.8h, v21.8h\n"
- "ldr x20, [x13, #0x70]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v20.8h, v4.8h, v18.8h\n"
- "fmla v19.8h, v5.8h, v16.8h\n"
- "ldr q4, [x23, #0x50]\n"
- "ldr x20, [x13, #0x98]\n"
- "fmla v24.8h, v6.8h, v22.8h\n"
- "fmla v20.8h, v1.8h, v17.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr q1, [x23, #0x20]\n"
- "fmla v19.8h, v2.8h, v16.8h\n"
- "fmla v24.8h, v7.8h, v17.8h\n"
- "ldr q2, [x23, #0x30]\n"
- "ldr x20, [x13, #0x90]\n"
- "fmla v23.8h, v7.8h, v21.8h\n"
- "fmla v23.8h, v8.8h, v16.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr x20, [x13, #0xa8]\n"
- "fmla v20.8h, v6.8h, v16.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "ldr x28, [x15, #0x40]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr x27, [x15, #0x78]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x24, [x15, #0x60]\n"
+ "ldr x26, [x15, #0x68]\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x21, x14]\n"
+ "ldr x23, [x15, #0x88]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x28, x14]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x25, x14]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr q23, [x23, x14]\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ldr q22, [x24, x14]\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "fmla v28.8h, v5.8h, v21.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v25.8h, v4.8h, v17.8h\n"
+ "ldr q21, [x20, x14]\n"
+ "fmla v29.8h, v5.8h, v20.8h\n"
+ "fmla v28.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x27, x14]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v24.8h, v4.8h, v16.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v29.8h, v6.8h, v22.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v25.8h, v1.8h, v19.8h\n"
+ "fmla v24.8h, v1.8h, v18.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v29.8h, v7.8h, v19.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v24.8h, v5.8h, v23.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.8h, v8.8h, v21.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v24.8h, v2.8h, v21.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "fmla v25.8h, v7.8h, v20.8h\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "ldr q9, [x27, x8]\n"
+ "ldr q10, [x26, x8]\n"
+ "fmla v24.8h, v3.8h, v16.8h\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "ldr q12, [x24, x8]\n"
+ "ldr q13, [x23, x8]\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x20, x8]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "str q29, [x13, x9]\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "ldr q14, [x22, x8]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v25.8h, v8.8h, v18.8h\n"
+ "str q28, [x12, x9]\n"
+ "fmla v24.8h, v6.8h, v18.8h\n"
+ "ldr q15, [x21, x8]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "ldr q11, [x25, x8]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x8, x8, #0x10\n"
+ "add x16, x16, #0xa0\n"
+ "cmp x8, x17, LSL #4\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
"fmax v24.8h, v24.8h, v26.8h\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0xa0]\n"
- "fmla v19.8h, v3.8h, v17.8h\n"
- "fmax v23.8h, v23.8h, v26.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr q3, [x23, #0x40]\n"
- "fmla v20.8h, v7.8h, v16.8h\n"
- "fmla v20.8h, v5.8h, v17.8h\n"
- "ldr q5, [x23, #0x60]\n"
- "ldr x20, [x13, #0xb0]\n"
- "add x22, x22, #0x10\n"
"fmin v24.8h, v24.8h, v27.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr x20, [x13, #0xb8]\n"
- "fmla v19.8h, v7.8h, v16.8h\n"
- "fmin v23.8h, v23.8h, v27.8h\n"
- "ldr q16, [x20, x28]\n"
- "ldr q7, [x23, #0x80]\n"
- "fmla v19.8h, v6.8h, v16.8h\n"
- "fmla v20.8h, v8.8h, v16.8h\n"
- "ldr q6, [x23, #0x70]\n"
- "ldr x20, [x13, #0xc0]\n"
- "fmax v20.8h, v20.8h, v26.8h\n"
- "fmin v20.8h, v20.8h, v27.8h\n"
- "ldr q16, [x20, x28]\n"
- "fmla v19.8h, v8.8h, v16.8h\n"
- "ldr q8, [x23, #0x90]\n"
- "fmax v19.8h, v19.8h, v26.8h\n"
- "ldp x21, x20, [x13, #0x0]\n"
- "ldr q9, [x21, x25]\n"
- "fmin v19.8h, v19.8h, v27.8h\n"
- "add x28, x28, #0x10\n"
- "ldr q10, [x20, x25]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "str q24, [x12, x22]\n"
- "add x23, x23, #0xa0\n"
- "ldr q11, [x21, x25]\n"
- "ldr q12, [x20, x25]\n"
- "str q23, [x11, x22]\n"
- "ldp x21, x20, [x13, #0x20]\n"
- "ldr q13, [x21, x25]\n"
- "str q20, [x10, x22]\n"
- "ldr q14, [x20, x25]\n"
- "ldp x21, x20, [x13, #0x30]\n"
- "str q19, [x9, x22]\n"
- "ldr q15, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
- "add x25, x25, #0x10\n"
- "cmp x25, x24, LSL #4\n"
+ "str q25, [x11, x9]\n"
+ "str q24, [x10, x9]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v25.16b, v31.16b\n fmla v25.8h, v8.8h, v9.8h\n"
- "mov v24.16b, v31.16b\n fmla v24.8h, v6.8h, v9.8h\n"
- "ldr x21, [x13, #0x40]\n"
- "ldr x20, [x13, #0x48]\n"
- "fmla v25.8h, v0.8h, v10.8h\n"
- "fmla v24.8h, v1.8h, v12.8h\n"
- "ldr q20, [x20, x28]\n"
- "ldr x20, [x13, #0x50]\n"
- "fmla v25.8h, v1.8h, v11.8h\n"
- "ldr q18, [x21, x28]\n"
- "fmla v24.8h, v2.8h, v13.8h\n"
- "ldr q19, [x20, x28]\n"
- "fmla v25.8h, v3.8h, v14.8h\n"
- "fmla v24.8h, v0.8h, v16.8h\n"
- "ldr x20, [x13, #0x58]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v25.8h, v4.8h, v15.8h\n"
- "fmla v24.8h, v4.8h, v18.8h\n"
- "ldr x21, [x13, #0x78]\n"
- "ldr x20, [x13, #0x60]\n"
- "ldr q23, [x20, x28]\n"
- "fmla v25.8h, v2.8h, v16.8h\n"
- "fmla v24.8h, v5.8h, v20.8h\n"
- "ldr x20, [x13, #0x80]\n"
- "ldr q22, [x20, x28]\n"
- "mov v21.16b, v31.16b\n fmla v21.8h, v2.8h, v9.8h\n"
- "mov v20.16b, v31.16b\n fmla v20.8h, v0.8h, v9.8h\n"
- "ldr x20, [x13, #0x68]\n"
- "ldr q18, [x20, x28]\n"
- "fmla v25.8h, v5.8h, v19.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x28, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr x26, [x15, #0x50]\n"
+ "ldr x25, [x15, #0x58]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v0.8h, v9.8h\n"
+ "ldr x27, [x15, #0x78]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x20, x14]\n"
+ "ldr x21, [x15, #0x88]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x28, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x26, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "ldr q23, [x24, x14]\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr q22, [x21, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "fmla v25.8h, v0.8h, v23.8h\n"
+ "fmla v29.8h, v5.8h, v21.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "fmla v29.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x27, x14]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v24.8h, v4.8h, v16.8h\n"
+ "ldr q21, [x24, x14]\n"
+ "fmla v25.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x25, x14]\n"
+ "fmla v28.8h, v6.8h, v23.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "fmla v25.8h, v1.8h, v18.8h\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v24.8h, v5.8h, v22.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v25.8h, v6.8h, v16.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v25.8h, v7.8h, v21.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
"fmla v24.8h, v3.8h, v19.8h\n"
- "ldr q16, [x21, x28]\n"
- "fmla v21.8h, v3.8h, v17.8h\n"
- "fmla v20.8h, v4.8h, v16.8h\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.8h, v0.8h, v23.8h\n"
- "fmla v20.8h, v1.8h, v22.8h\n"
- "ldr x20, [x13, #0x70]\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0x98]\n"
- "fmla v21.8h, v4.8h, v18.8h\n"
- "ldr q19, [x20, x28]\n"
- "fmla v20.8h, v5.8h, v16.8h\n"
- "fmla v25.8h, v6.8h, v23.8h\n"
- "ldr x20, [x13, #0x90]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.8h, v1.8h, v17.8h\n"
- "ldr x20, [x13, #0xa8]\n"
- "fmla v20.8h, v2.8h, v19.8h\n"
- "fmla v25.8h, v7.8h, v17.8h\n"
- "ldr q18, [x20, x28]\n"
- "ldr x20, [x13, #0xa0]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v21.8h, v6.8h, v16.8h\n"
- "fmla v20.8h, v3.8h, v18.8h\n"
- "ldr x20, [x13, #0xb0]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.8h, v7.8h, v17.8h\n"
- "fmla v20.8h, v7.8h, v16.8h\n"
- "ldr x20, [x13, #0xb8]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v24.8h, v7.8h, v22.8h\n"
- "fmla v21.8h, v5.8h, v18.8h\n"
- "ldr x20, [x13, #0xc0]\n"
- "fmla v20.8h, v6.8h, v17.8h\n"
- "fmla v24.8h, v8.8h, v19.8h\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.8h, v8.8h, v17.8h\n"
- "fmla v20.8h, v8.8h, v16.8h\n"
+ "str q28, [x13, x9]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "str q29, [x12, x9]\n"
+ "fmla v24.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
"fmax v25.8h, v25.8h, v26.8h\n"
- "add x22, x22, #0x10\n"
- "fmax v24.8h, v24.8h, v26.8h\n"
- "fmax v21.8h, v21.8h, v26.8h\n"
- "add x28, x28, #0x10\n"
- "fmax v20.8h, v20.8h, v26.8h\n"
"fmin v25.8h, v25.8h, v27.8h\n"
- "str q25, [x12, x22]\n"
+ "fmla v24.8h, v8.8h, v16.8h\n"
+ "str q25, [x11, x9]\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
"fmin v24.8h, v24.8h, v27.8h\n"
- "fmin v21.8h, v21.8h, v27.8h\n"
- "str q24, [x11, x22]\n"
- "fmin v20.8h, v20.8h, v27.8h\n"
- "str q21, [x10, x22]\n"
- "str q20, [x9, x22]\n"
+ "str q24, [x10, x9]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 80f\n"
- "ldr q31, [x23, #0x0]\n"
- "ldr q0, [x23, #0x10]\n"
- "mov x20, x28\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "add x13, x13, x20\n"
"add x12, x12, x20\n"
- "ldr q1, [x23, #0x20]\n"
- "ldr q2, [x23, #0x30]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
"add x11, x11, x20\n"
"add x10, x10, x20\n"
- "ldr q3, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
- "add x9, x9, x20\n"
- "ldr q5, [x23, #0x60]\n"
- "ldr q6, [x23, #0x70]\n"
- "ldr q7, [x23, #0x80]\n"
- "ldr q8, [x23, #0x90]\n"
- "ldr x27, [x13, #0x0]\n"
- "ldr x26, [x13, #0x8]\n"
- "add x27, x27, x28\n"
- "add x26, x26, x28\n"
- "ldr x25, [x13, #0x10]\n"
- "ldr x24, [x13, #0x18]\n"
- "add x25, x25, x28\n"
- "add x24, x24, x28\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr x22, [x13, #0x28]\n"
- "add x23, x23, x28\n"
- "add x22, x22, x28\n"
- "ldr x21, [x13, #0x30]\n"
- "ldr x20, [x13, #0x38]\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "ldr x27, [x15, #0x0]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "ldr x25, [x15, #0x10]\n"
+ "ldr x24, [x15, #0x18]\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v9.d }[0], [x27], #0x8\n"
"ld1 { v10.d }[0], [x26], #0x8\n"
@@ -426,19 +426,19 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v16.h }[0], [x20], #0x2\n"
"7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
"mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
- "fmla v28.8h, v0.8h, v10.8h\n"
- "ldr x20, [x13, #0x40]\n"
- "add x20, x20, x28\n"
"mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
- "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
- "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
"fmla v28.8h, v4.8h, v15.8h\n"
- "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
- "fmla v31.8h, v0.8h, v9.8h\n"
"fmla v28.8h, v2.8h, v16.8h\n"
- "fmla v29.8h, v0.8h, v16.8h\n"
"tbz %x[n_channels], #2, 9f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 8f\n"
@@ -459,9 +459,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"11:" // Oddments: Load input (1, 3): Bit 2: End
- "ldr x20, [x13, #0x48]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v29.8h, v4.8h, v11.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -482,9 +482,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"15:" // Oddments: Load input (1, 4): Bit 2: End
- "ldr x20, [x13, #0x50]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla v29.8h, v5.8h, v12.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 17f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
@@ -505,10 +505,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"19:" // Oddments: Load input (1, 2): Bit 2: End
- "ldr x20, [x13, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v28.8h, v5.8h, v13.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -529,9 +529,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v14.h }[0], [x20], #0x2\n"
"23:" // Oddments: Load input (3, 0): Bit 2: End
- "ldr x20, [x13, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v30.8h, v3.8h, v14.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -552,10 +552,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v15.h }[0], [x20], #0x2\n"
"27:" // Oddments: Load input (2, 0): Bit 2: End
- "ldr x20, [x13, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v28.8h, v6.8h, v15.8h\n"
"fmla v30.8h, v0.8h, v15.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 29f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
@@ -576,9 +576,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"31:" // Oddments: Load input (3, 1): Bit 2: End
- "ldr x20, [x13, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v30.8h, v4.8h, v11.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 33f\n"
"ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -599,10 +599,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v16.h }[0], [x20], #0x2\n"
"35:" // Oddments: Load input (2, 1): Bit 2: End
- "ldr x20, [x13, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.8h, v7.8h, v16.8h\n"
"fmla v30.8h, v1.8h, v16.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -623,9 +623,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"38:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"39:" // Oddments: Load input (3, 3): Bit 2: End
- "ldr x20, [x13, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v31.8h, v4.8h, v13.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -646,10 +646,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v12.h }[0], [x20], #0x2\n"
"43:" // Oddments: Load input (2, 3): Bit 2: End
- "ldr x20, [x13, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v29.8h, v7.8h, v12.8h\n"
"fmla v31.8h, v1.8h, v12.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 45f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
@@ -670,9 +670,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v14.h }[0], [x20], #0x2\n"
"47:" // Oddments: Load input (3, 4): Bit 2: End
- "ldr x20, [x13, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v31.8h, v5.8h, v14.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 49f\n"
"ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
@@ -693,9 +693,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"50:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
"ld1 { v15.h }[0], [x20], #0x2\n"
"51:" // Oddments: Load input (4, 0): Bit 2: End
- "ldr x20, [x13, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.8h, v6.8h, v15.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 53f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
@@ -716,10 +716,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"54:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
"ld1 { v11.h }[0], [x20], #0x2\n"
"55:" // Oddments: Load input (2, 4): Bit 2: End
- "ldr x20, [x13, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v29.8h, v8.8h, v11.8h\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 57f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
@@ -740,9 +740,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"58:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
"ld1 { v13.h }[0], [x20], #0x2\n"
"59:" // Oddments: Load input (4, 1): Bit 2: End
- "ldr x20, [x13, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v30.8h, v7.8h, v13.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 61f\n"
"ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
@@ -763,10 +763,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"62:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v16.h }[0], [x20], #0x2\n"
"63:" // Oddments: Load input (3, 2): Bit 2: End
- "ldr x20, [x13, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v30.8h, v5.8h, v16.8h\n"
"fmla v31.8h, v3.8h, v16.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 65f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -787,9 +787,9 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"66:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
"ld1 { v14.h }[0], [x20], #0x2\n"
"67:" // Oddments: Load input (4, 3): Bit 2: End
- "ldr x20, [x13, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v31.8h, v7.8h, v14.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 69f\n"
"ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -810,10 +810,10 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"70:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
"ld1 { v15.h }[0], [x20], #0x2\n"
"71:" // Oddments: Load input (4, 2): Bit 2: End
- "ldr x20, [x13, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v30.8h, v8.8h, v15.8h\n"
"fmla v31.8h, v6.8h, v15.8h\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 73f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
@@ -838,56 +838,56 @@ void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"fmax v28.8h, v28.8h, v26.8h\n"
"fmax v29.8h, v29.8h, v26.8h\n"
"fmax v30.8h, v30.8h, v26.8h\n"
- "fmax v31.8h, v31.8h, v26.8h\n"
"fmin v28.8h, v28.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
"fmin v29.8h, v29.8h, v27.8h\n"
"fmin v30.8h, v30.8h, v27.8h\n"
"fmin v31.8h, v31.8h, v27.8h\n"
"tbz %x[n_channels], #2, 77f\n"
- "st1 { v28.d }[0], [x12], #0x8\n"
- "st1 { v29.d }[0], [x11], #0x8\n"
- "st1 { v30.d }[0], [x10], #0x8\n"
- "st1 { v31.d }[0], [x9], #0x8\n"
+ "st1 { v28.d }[0], [x13], #0x8\n"
+ "st1 { v29.d }[0], [x12], #0x8\n"
+ "st1 { v30.d }[0], [x11], #0x8\n"
+ "st1 { v31.d }[0], [x10], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
- "st1 { v28.s }[2], [x12], #0x4\n"
- "st1 { v29.s }[2], [x11], #0x4\n"
- "st1 { v30.s }[2], [x10], #0x4\n"
- "st1 { v31.s }[2], [x9], #0x4\n"
+ "st1 { v28.s }[2], [x13], #0x4\n"
+ "st1 { v29.s }[2], [x12], #0x4\n"
+ "st1 { v30.s }[2], [x11], #0x4\n"
+ "st1 { v31.s }[2], [x10], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "st1 { v28.h }[6], [x12], #0x2\n"
- "st1 { v29.h }[6], [x11], #0x2\n"
- "st1 { v30.h }[6], [x10], #0x2\n"
- "st1 { v31.h }[6], [x9], #0x2\n"
+ "st1 { v28.h }[6], [x13], #0x2\n"
+ "st1 { v29.h }[6], [x12], #0x2\n"
+ "st1 { v30.h }[6], [x11], #0x2\n"
+ "st1 { v31.h }[6], [x10], #0x2\n"
"b 79f\n"
"76:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 79f\n"
- "st1 { v28.h }[4], [x12], #0x2\n"
- "st1 { v29.h }[4], [x11], #0x2\n"
- "st1 { v30.h }[4], [x10], #0x2\n"
- "st1 { v31.h }[4], [x9], #0x2\n"
+ "st1 { v28.h }[4], [x13], #0x2\n"
+ "st1 { v29.h }[4], [x12], #0x2\n"
+ "st1 { v30.h }[4], [x11], #0x2\n"
+ "st1 { v31.h }[4], [x10], #0x2\n"
"b 79f\n"
"77:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 78f\n"
- "st1 { v28.s }[0], [x12], #0x4\n"
- "st1 { v29.s }[0], [x11], #0x4\n"
- "st1 { v30.s }[0], [x10], #0x4\n"
- "st1 { v31.s }[0], [x9], #0x4\n"
+ "st1 { v28.s }[0], [x13], #0x4\n"
+ "st1 { v29.s }[0], [x12], #0x4\n"
+ "st1 { v30.s }[0], [x11], #0x4\n"
+ "st1 { v31.s }[0], [x10], #0x4\n"
"tbz %x[n_channels], #0, 79f\n"
- "st1 { v28.h }[2], [x12], #0x2\n"
- "st1 { v29.h }[2], [x11], #0x2\n"
- "st1 { v30.h }[2], [x10], #0x2\n"
- "st1 { v31.h }[2], [x9], #0x2\n"
+ "st1 { v28.h }[2], [x13], #0x2\n"
+ "st1 { v29.h }[2], [x12], #0x2\n"
+ "st1 { v30.h }[2], [x11], #0x2\n"
+ "st1 { v31.h }[2], [x10], #0x2\n"
"b 79f\n"
"78:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "st1 { v28.h }[0], [x12], #0x2\n"
- "st1 { v29.h }[0], [x11], #0x2\n"
- "st1 { v30.h }[0], [x10], #0x2\n"
- "st1 { v31.h }[0], [x9], #0x2\n"
+ "st1 { v28.h }[0], [x13], #0x2\n"
+ "st1 { v29.h }[0], [x12], #0x2\n"
+ "st1 { v30.h }[0], [x11], #0x2\n"
+ "st1 { v31.h }[0], [x10], #0x2\n"
"79:" // Oddments: Store: Bit 2: End
"80:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
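(Editorial note, not part of the patch: the "Oddments: Store" branches in the hunks above write the 1-7 leftover fp16 channels by testing bits 2, 1 and 0 of n_channels and storing 4, 2 and 1 vector lanes respectively. A minimal C++ sketch of that pattern follows; the function and parameter names are hypothetical and it assumes an AArch64 toolchain that provides __fp16.)

#include <cstddef>

using fp16 = __fp16;  // assumption: __fp16 is available on the target toolchain

// 'acc' holds the 8 accumulated lanes of one output row; 'out' is its
// destination. n_channels & 7 is the remainder the oddments path handles.
static void store_oddments(fp16 *out, const fp16 (&acc)[8], std::size_t n_channels)
{
  std::size_t lane = 0;
  if (n_channels & 4) {               // mirrors: tbz %x[n_channels], #2 -> st1 { v.d }[0]
    for (int i = 0; i < 4; i++) out[lane + i] = acc[lane + i];
    lane += 4;
  }
  if (n_channels & 2) {               // mirrors: tbz %x[n_channels], #1 -> st1 { v.s }[...]
    for (int i = 0; i < 2; i++) out[lane + i] = acc[lane + i];
    lane += 2;
  }
  if (n_channels & 1) {               // mirrors: tbz %x[n_channels], #0 -> st1 { v.h }[...]
    out[lane] = acc[lane];
  }
}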
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index cecaf79704..f17beef55e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,251 +87,251 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x27, #0x0\n"
- "mov x26, #0x0\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
"1:" // Tile loop
- "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x23, #0x2\n"
- "mov x25, #0x2\n"
- "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x9, #0x2\n"
+ "mov x28, #0x2\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
"ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x2, x2, #0x1\n"
- "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
- "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x6, x2, x2\n"
- "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
- "add x4, x4, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x7, x4, x24, LSL #1\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
- "add x17, x7, x24, LSL #1\n"
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "mul x20, x20, x25\n" // offset *= output_tile_size
- "lsr x22, %x[n_channels], #0x3\n"
- "add x16, x17, x24, LSL #1\n"
- "add x15, x6, x2\n"
- "add x14, x16, x24, LSL #1\n"
- "add x13, x15, x2\n"
- "add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x24, %x[n_channels], #0x3\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v27.8h }, [x20]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x23, #0x0\n"
"ld1r { v15.8h }, [x20]\n"
- "add x12, x14, x24, LSL #1\n"
- "add x11, x13, x2\n"
- "add x10, x5, x21, LSL #1\n"
+ "mul x22, x11, x27\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x21, XZR, x26\n"
+ "mul x20, x11, x25\n" // offset = tile_i * ld_output_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x22, x10, x2, x22\n" // offset += tile_j * ld_input_col
+ "lsl x2, x2, #0x1\n"
+ "madd x20, x10, x3, x20\n" // offset += tile_j * ld_output_col
"lsl x3, x3, #0x1\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q25, [x8, #0x0]\n"
- "ldr q0, [x8, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q1, [x8, #0x20]\n"
- "ldr q2, [x8, #0x30]\n"
- "ldr q3, [x8, #0x40]\n"
- "ldr q4, [x8, #0x50]\n"
- "add x8, x8, #0x60\n"
+ "mul x22, x22, x9\n" // offset *= kernel_stride * output_size
+ "add x7, x2, x2\n"
+ "add x8, x7, x2\n"
+ "add x17, x8, x2\n"
+ "mul x20, x20, x28\n" // offset *= output_tile_size
+ "add x16, x17, x2\n"
+ "add x4, x4, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x15, x4, x27, LSL #1\n"
+ "add x14, x15, x27, LSL #1\n"
+ "add x13, x14, x27, LSL #1\n"
+ "add x12, x13, x27, LSL #1\n"
+ "add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x11, x12, x27, LSL #1\n"
+ "add x10, x5, x25, LSL #1\n"
+ "cbz x24, 4f\n"
+ "ldr q25, [x6, #0x0]\n"
+ "ldr q0, [x6, #0x10]\n"
+ "cmp x26, x24, LSL #4\n"
+ "ldr q1, [x6, #0x20]\n"
+ "ldr q2, [x6, #0x30]\n"
+ "ldr q3, [x6, #0x40]\n"
+ "ldr q4, [x6, #0x50]\n"
+ "add x6, x6, #0x60\n"
"ld1 { v5.8h }, [x4]\n"
"ldr q6, [x4, x2]\n"
- "ld1 { v7.8h }, [x7]\n"
- "ldr q8, [x7, x2]\n"
- "ldr q9, [x4, x6]\n"
- "ldr q13, [x7, x6]\n"
- "ldr q11, [x4, x15]\n"
- "ldr q12, [x4, x13]\n"
- "ldr q10, [x7, x11]\n"
- "ld1 { v14.8h }, [x17]\n"
+ "ld1 { v7.8h }, [x15]\n"
+ "ldr q8, [x15, x2]\n"
+ "ldr q9, [x4, x7]\n"
+ "ldr q13, [x15, x7]\n"
+ "ldr q11, [x4, x8]\n"
+ "ldr q12, [x4, x17]\n"
+ "ldr q10, [x15, x16]\n"
+ "ld1 { v14.8h }, [x14]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v5.8h\n"
- "ldr q23, [x7, x15]\n"
+ "ldr q23, [x15, x8]\n"
"mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v6.8h\n"
- "add x23, x23, #0x10\n"
+ "add x26, x26, #0x10\n"
"mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v7.8h\n"
"mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v8.8h\n"
- "ldr q19, [x8, #0x0]\n"
- "ldr q25, [x8, #0x140]\n"
+ "ldr q19, [x6, #0x0]\n"
+ "ldr q25, [x6, #0x140]\n"
+ "cmp x26, x24, LSL #4\n"
+ "add x21, x21, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v30.8h, v1.8h, v6.8h\n"
- "ldr q21, [x7, x13]\n"
+ "ldr q21, [x15, x17]\n"
+ "add x15, x15, #0x10\n"
"fmla v31.8h, v1.8h, v9.8h\n"
- "add x7, x7, #0x10\n"
"fmla v29.8h, v1.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v13.8h\n"
- "ldr q1, [x8, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x6, #0x10]\n"
"fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q18, [x4, x11]\n"
- "fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x4, x16]\n"
"add x4, x4, #0x10\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
"fmla v28.8h, v2.8h, v23.8h\n"
- "ldr q17, [x8, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "ldr q17, [x6, #0x20]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "ldr q6, [x17, x2]\n"
+ "ldr q6, [x14, x2]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "add x21, x21, #0x10\n"
"fmla v29.8h, v3.8h, v23.8h\n"
"fmla v28.8h, v3.8h, v21.8h\n"
- "ldr q16, [x8, #0x30]\n"
+ "ldr q16, [x6, #0x30]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "ldr q2, [x17, x6]\n"
+ "ldr q2, [x14, x7]\n"
"fmla v31.8h, v4.8h, v18.8h\n"
- "ldr q0, [x17, x15]\n"
+ "ldr q0, [x14, x8]\n"
"fmla v29.8h, v4.8h, v21.8h\n"
"fmla v28.8h, v4.8h, v10.8h\n"
- "ldr q20, [x8, #0x40]\n"
+ "ldr q20, [x6, #0x40]\n"
"fmla v30.8h, v19.8h, v7.8h\n"
- "ld1 { v7.8h }, [x7]\n"
+ "ld1 { v7.8h }, [x15]\n"
"fmla v31.8h, v19.8h, v8.8h\n"
"fmla v29.8h, v19.8h, v14.8h\n"
"fmla v28.8h, v19.8h, v6.8h\n"
- "ldr q19, [x8, #0x50]\n"
+ "ldr q19, [x6, #0x50]\n"
"fmla v30.8h, v1.8h, v8.8h\n"
- "ldr q26, [x17, x11]\n"
+ "ldr q26, [x14, x16]\n"
"fmla v31.8h, v1.8h, v13.8h\n"
"fmla v29.8h, v1.8h, v6.8h\n"
"fmla v28.8h, v1.8h, v2.8h\n"
- "ldr q18, [x8, #0x60]\n"
+ "ldr q18, [x6, #0x60]\n"
"fmla v30.8h, v17.8h, v13.8h\n"
- "ldr q1, [x17, x13]\n"
+ "ldr q1, [x14, x17]\n"
+ "add x14, x14, #0x10\n"
"fmla v31.8h, v17.8h, v23.8h\n"
- "add x17, x17, #0x10\n"
"fmla v29.8h, v17.8h, v2.8h\n"
"fmla v28.8h, v17.8h, v0.8h\n"
- "ldr q17, [x8, #0x70]\n"
+ "ldr q17, [x6, #0x70]\n"
"fmla v30.8h, v16.8h, v23.8h\n"
- "ld1 { v24.8h }, [x16]\n"
+ "ld1 { v24.8h }, [x13]\n"
"fmla v31.8h, v16.8h, v21.8h\n"
"fmla v29.8h, v16.8h, v0.8h\n"
"fmla v28.8h, v16.8h, v1.8h\n"
- "ldr q16, [x8, #0x80]\n"
+ "ldr q16, [x6, #0x80]\n"
"fmla v30.8h, v20.8h, v21.8h\n"
- "ldr q23, [x16, x2]\n"
+ "ldr q23, [x13, x2]\n"
"fmla v31.8h, v20.8h, v10.8h\n"
- "ldr q22, [x16, x6]\n"
+ "ldr q22, [x13, x7]\n"
"fmla v29.8h, v20.8h, v1.8h\n"
"fmla v28.8h, v20.8h, v26.8h\n"
- "ldr q21, [x8, #0x90]\n"
+ "ldr q21, [x6, #0x90]\n"
"fmla v30.8h, v19.8h, v14.8h\n"
- "ldr q5, [x16, x11]\n"
+ "ldr q5, [x13, x16]\n"
"fmla v31.8h, v19.8h, v6.8h\n"
"fmla v29.8h, v19.8h, v24.8h\n"
"fmla v28.8h, v19.8h, v23.8h\n"
- "ldr q11, [x8, #0xa0]\n"
+ "ldr q11, [x6, #0xa0]\n"
"fmla v30.8h, v18.8h, v6.8h\n"
- "ldr q20, [x16, x15]\n"
+ "ldr q20, [x13, x8]\n"
"fmla v31.8h, v18.8h, v2.8h\n"
"fmla v29.8h, v18.8h, v23.8h\n"
"fmla v28.8h, v18.8h, v22.8h\n"
- "ldr q18, [x8, #0xb0]\n"
+ "ldr q18, [x6, #0xb0]\n"
"fmla v30.8h, v17.8h, v2.8h\n"
- "ldr q19, [x16, x13]\n"
+ "ldr q19, [x13, x17]\n"
+ "add x13, x13, #0x10\n"
"fmla v31.8h, v17.8h, v0.8h\n"
- "add x16, x16, #0x10\n"
"fmla v29.8h, v17.8h, v22.8h\n"
"fmla v28.8h, v17.8h, v20.8h\n"
- "ldr q17, [x8, #0xc0]\n"
+ "ldr q17, [x6, #0xc0]\n"
"fmla v30.8h, v16.8h, v0.8h\n"
- "ld1 { v0.8h }, [x14]\n"
+ "ld1 { v0.8h }, [x12]\n"
"fmla v31.8h, v16.8h, v1.8h\n"
"fmla v29.8h, v16.8h, v20.8h\n"
"fmla v28.8h, v16.8h, v19.8h\n"
- "ldr q16, [x8, #0xd0]\n"
+ "ldr q16, [x6, #0xd0]\n"
"fmla v30.8h, v21.8h, v1.8h\n"
- "ldr q4, [x14, x2]\n"
+ "ldr q4, [x12, x2]\n"
"fmla v31.8h, v21.8h, v26.8h\n"
- "ldr q12, [x14, x13]\n"
+ "ldr q12, [x12, x17]\n"
"fmla v29.8h, v21.8h, v19.8h\n"
"fmla v28.8h, v21.8h, v5.8h\n"
- "ldr q13, [x8, #0xe0]\n"
+ "ldr q13, [x6, #0xe0]\n"
"fmla v30.8h, v11.8h, v24.8h\n"
- "ldr q6, [x14, x6]\n"
+ "ldr q6, [x12, x7]\n"
"fmla v31.8h, v11.8h, v23.8h\n"
"fmla v29.8h, v11.8h, v0.8h\n"
"fmla v28.8h, v11.8h, v4.8h\n"
- "ldr q24, [x8, #0xf0]\n"
+ "ldr q24, [x6, #0xf0]\n"
"fmla v30.8h, v18.8h, v23.8h\n"
- "ldr q26, [x14, x15]\n"
+ "ldr q26, [x12, x8]\n"
"fmla v31.8h, v18.8h, v22.8h\n"
"fmla v29.8h, v18.8h, v4.8h\n"
"fmla v28.8h, v18.8h, v6.8h\n"
- "ldr q23, [x8, #0x100]\n"
+ "ldr q23, [x6, #0x100]\n"
"fmla v30.8h, v17.8h, v22.8h\n"
- "ldr q22, [x14, x11]\n"
+ "ldr q22, [x12, x16]\n"
+ "add x12, x12, #0x10\n"
"fmla v31.8h, v17.8h, v20.8h\n"
- "add x14, x14, #0x10\n"
"fmla v29.8h, v17.8h, v6.8h\n"
"fmla v28.8h, v17.8h, v26.8h\n"
- "ldr q21, [x8, #0x110]\n"
+ "ldr q21, [x6, #0x110]\n"
"fmla v30.8h, v16.8h, v20.8h\n"
- "ld1 { v18.8h }, [x12]\n"
+ "ld1 { v18.8h }, [x11]\n"
"fmla v31.8h, v16.8h, v19.8h\n"
"fmla v29.8h, v16.8h, v26.8h\n"
"fmla v28.8h, v16.8h, v12.8h\n"
- "ldr q20, [x8, #0x120]\n"
+ "ldr q20, [x6, #0x120]\n"
"fmla v30.8h, v13.8h, v19.8h\n"
- "ldr q17, [x12, x2]\n"
+ "ldr q17, [x11, x2]\n"
"fmla v31.8h, v13.8h, v5.8h\n"
- "ld1 { v14.8h }, [x17]\n"
+ "ld1 { v14.8h }, [x14]\n"
"fmla v29.8h, v13.8h, v12.8h\n"
"fmla v28.8h, v13.8h, v22.8h\n"
- "ldr q19, [x8, #0x130]\n"
+ "ldr q19, [x6, #0x130]\n"
"fmla v30.8h, v24.8h, v0.8h\n"
- "ldr q16, [x12, x6]\n"
+ "ldr q16, [x11, x7]\n"
"fmla v31.8h, v24.8h, v4.8h\n"
"fmla v29.8h, v24.8h, v18.8h\n"
- "ldr q18, [x12, x15]\n"
+ "ldr q18, [x11, x8]\n"
"fmla v28.8h, v24.8h, v17.8h\n"
- "ldr q0, [x8, #0x150]\n"
+ "ldr q0, [x6, #0x150]\n"
"fmla v30.8h, v23.8h, v4.8h\n"
- "ldr q13, [x7, x6]\n"
+ "ldr q13, [x15, x7]\n"
"fmla v31.8h, v23.8h, v6.8h\n"
"fmla v29.8h, v23.8h, v17.8h\n"
- "ldr q17, [x12, x13]\n"
+ "ldr q17, [x11, x17]\n"
"fmla v28.8h, v23.8h, v16.8h\n"
- "ldr q1, [x8, #0x160]\n"
+ "ldr q1, [x6, #0x160]\n"
"fmla v30.8h, v21.8h, v6.8h\n"
"ld1 { v5.8h }, [x4]\n"
"fmla v31.8h, v21.8h, v26.8h\n"
"fmla v29.8h, v21.8h, v16.8h\n"
- "ldr q16, [x12, x11]\n"
+ "ldr q16, [x11, x16]\n"
+ "add x11, x11, #0x10\n"
"fmla v28.8h, v21.8h, v18.8h\n"
- "ldr q2, [x8, #0x170]\n"
+ "ldr q2, [x6, #0x170]\n"
"fmla v30.8h, v20.8h, v26.8h\n"
"ldr q6, [x4, x2]\n"
"fmla v31.8h, v20.8h, v12.8h\n"
- "add x12, x12, #0x10\n"
"fmla v29.8h, v20.8h, v18.8h\n"
- "ldr q11, [x4, x15]\n"
+ "ldr q11, [x4, x8]\n"
"fmla v28.8h, v20.8h, v17.8h\n"
- "ldr q3, [x8, #0x180]\n"
+ "ldr q3, [x6, #0x180]\n"
"fmla v30.8h, v19.8h, v12.8h\n"
- "ldr q8, [x7, x2]\n"
+ "ldr q8, [x15, x2]\n"
"fmla v31.8h, v19.8h, v22.8h\n"
- "ldr q10, [x7, x11]\n"
+ "ldr q10, [x15, x16]\n"
"fmla v29.8h, v19.8h, v17.8h\n"
- "ldr q12, [x4, x13]\n"
+ "ldr q12, [x4, x17]\n"
"fmla v28.8h, v19.8h, v16.8h\n"
- "ldr q9, [x4, x6]\n"
- "ldr q4, [x8, #0x190]\n"
+ "ldr q9, [x4, x7]\n"
+ "ldr q4, [x6, #0x190]\n"
+ "add x6, x6, #0x1a0\n"
"fmax v30.8h, v30.8h, v27.8h\n"
"fmax v31.8h, v31.8h, v27.8h\n"
- "add x8, x8, #0x1a0\n"
"fmax v29.8h, v29.8h, v27.8h\n"
"fmax v28.8h, v28.8h, v27.8h\n"
"fmin v30.8h, v30.8h, v15.8h\n"
"fmin v31.8h, v31.8h, v15.8h\n"
- "st1 { v30.8h }, [x5]\n"
"fmin v29.8h, v29.8h, v15.8h\n"
"fmin v28.8h, v28.8h, v15.8h\n"
+ "st1 { v30.8h }, [x5]\n"
"str q31, [x5, x3]\n"
"add x5, x5, #0x10\n"
"st1 { v29.8h }, [x10]\n"
@@ -340,163 +340,163 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v5.8h\n"
- "ldr q22, [x7, x15]\n"
+ "ldr q22, [x15, x8]\n"
"mov v5.16b, v25.16b\n fmla v5.8h, v0.8h, v6.8h\n"
"mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v8.8h\n"
- "ldr q19, [x8, #0x0]\n"
+ "ldr q19, [x6, #0x0]\n"
"fmla v31.8h, v1.8h, v6.8h\n"
- "ldr q21, [x7, x13]\n"
+ "ldr q21, [x15, x17]\n"
+ "add x15, x15, #0x10\n"
"fmla v5.8h, v1.8h, v9.8h\n"
- "add x7, x7, #0x10\n"
"fmla v30.8h, v1.8h, v8.8h\n"
"fmla v29.8h, v1.8h, v13.8h\n"
- "ldr q18, [x8, #0x10]\n"
+ "ldr q18, [x6, #0x10]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q16, [x4, x11]\n"
- "fmla v5.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x4, x16]\n"
"add x4, x4, #0x10\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"fmla v29.8h, v2.8h, v22.8h\n"
- "ldr q17, [x8, #0x20]\n"
+ "ldr q17, [x6, #0x20]\n"
"fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q6, [x17, x2]\n"
+ "ldr q6, [x14, x2]\n"
"fmla v5.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v22.8h\n"
"fmla v29.8h, v3.8h, v21.8h\n"
- "ldr q20, [x8, #0x30]\n"
+ "ldr q20, [x6, #0x30]\n"
"fmla v31.8h, v4.8h, v12.8h\n"
- "ldr q2, [x17, x6]\n"
+ "ldr q2, [x14, x7]\n"
"fmla v5.8h, v4.8h, v16.8h\n"
- "ldr q28, [x17, x15]\n"
+ "ldr q28, [x14, x8]\n"
"fmla v30.8h, v4.8h, v21.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q16, [x8, #0x40]\n"
+ "ldr q16, [x6, #0x40]\n"
"fmla v31.8h, v19.8h, v7.8h\n"
"fmla v5.8h, v19.8h, v8.8h\n"
"fmla v30.8h, v19.8h, v14.8h\n"
"fmla v29.8h, v19.8h, v6.8h\n"
- "ldr q19, [x8, #0x50]\n"
+ "ldr q19, [x6, #0x50]\n"
"fmla v31.8h, v18.8h, v8.8h\n"
- "ldr q1, [x17, x11]\n"
+ "ldr q1, [x14, x16]\n"
"fmla v5.8h, v18.8h, v13.8h\n"
"fmla v30.8h, v18.8h, v6.8h\n"
"fmla v29.8h, v18.8h, v2.8h\n"
- "ldr q18, [x8, #0x60]\n"
+ "ldr q18, [x6, #0x60]\n"
"fmla v31.8h, v17.8h, v13.8h\n"
- "ldr q26, [x17, x13]\n"
+ "ldr q26, [x14, x17]\n"
+ "add x14, x14, #0x10\n"
"fmla v5.8h, v17.8h, v22.8h\n"
- "add x17, x17, #0x10\n"
"fmla v30.8h, v17.8h, v2.8h\n"
"fmla v29.8h, v17.8h, v28.8h\n"
- "ldr q17, [x8, #0x70]\n"
+ "ldr q17, [x6, #0x70]\n"
"fmla v31.8h, v20.8h, v22.8h\n"
- "ld1 { v25.8h }, [x16]\n"
+ "ld1 { v25.8h }, [x13]\n"
"fmla v5.8h, v20.8h, v21.8h\n"
"fmla v30.8h, v20.8h, v28.8h\n"
"fmla v29.8h, v20.8h, v26.8h\n"
- "ldr q24, [x8, #0x80]\n"
+ "ldr q24, [x6, #0x80]\n"
"fmla v31.8h, v16.8h, v21.8h\n"
- "ldr q23, [x16, x2]\n"
+ "ldr q23, [x13, x2]\n"
"fmla v5.8h, v16.8h, v10.8h\n"
- "ldr q0, [x16, x6]\n"
+ "ldr q0, [x13, x7]\n"
"fmla v30.8h, v16.8h, v26.8h\n"
"fmla v29.8h, v16.8h, v1.8h\n"
- "ldr q22, [x8, #0x90]\n"
+ "ldr q22, [x6, #0x90]\n"
"fmla v31.8h, v19.8h, v14.8h\n"
- "ldr q16, [x16, x11]\n"
+ "ldr q16, [x13, x16]\n"
"fmla v5.8h, v19.8h, v6.8h\n"
"fmla v30.8h, v19.8h, v25.8h\n"
"fmla v29.8h, v19.8h, v23.8h\n"
- "ldr q21, [x8, #0xa0]\n"
+ "ldr q21, [x6, #0xa0]\n"
"fmla v31.8h, v18.8h, v6.8h\n"
- "ldr q20, [x16, x15]\n"
+ "ldr q20, [x13, x8]\n"
"fmla v5.8h, v18.8h, v2.8h\n"
"fmla v30.8h, v18.8h, v23.8h\n"
"fmla v29.8h, v18.8h, v0.8h\n"
- "ldr q18, [x8, #0xb0]\n"
+ "ldr q18, [x6, #0xb0]\n"
"fmla v31.8h, v17.8h, v2.8h\n"
- "ldr q19, [x16, x13]\n"
+ "ldr q19, [x13, x17]\n"
+ "add x13, x13, #0x10\n"
"fmla v5.8h, v17.8h, v28.8h\n"
- "add x16, x16, #0x10\n"
"fmla v30.8h, v17.8h, v0.8h\n"
"fmla v29.8h, v17.8h, v20.8h\n"
- "ldr q17, [x8, #0xc0]\n"
+ "ldr q17, [x6, #0xc0]\n"
"fmla v31.8h, v24.8h, v28.8h\n"
- "ld1 { v7.8h }, [x14]\n"
+ "ld1 { v7.8h }, [x12]\n"
"fmla v5.8h, v24.8h, v26.8h\n"
"fmla v30.8h, v24.8h, v20.8h\n"
"fmla v29.8h, v24.8h, v19.8h\n"
- "ldr q2, [x8, #0xd0]\n"
+ "ldr q2, [x6, #0xd0]\n"
"fmla v31.8h, v22.8h, v26.8h\n"
- "ldr q28, [x14, x2]\n"
+ "ldr q28, [x12, x2]\n"
"fmla v5.8h, v22.8h, v1.8h\n"
- "ldr q13, [x14, x13]\n"
+ "ldr q13, [x12, x17]\n"
"fmla v30.8h, v22.8h, v19.8h\n"
"fmla v29.8h, v22.8h, v16.8h\n"
- "ldr q14, [x8, #0xe0]\n"
+ "ldr q14, [x6, #0xe0]\n"
"fmla v31.8h, v21.8h, v25.8h\n"
- "ldr q26, [x14, x6]\n"
+ "ldr q26, [x12, x7]\n"
"fmla v5.8h, v21.8h, v23.8h\n"
"fmla v30.8h, v21.8h, v7.8h\n"
"fmla v29.8h, v21.8h, v28.8h\n"
- "ldr q25, [x8, #0xf0]\n"
+ "ldr q25, [x6, #0xf0]\n"
"fmla v31.8h, v18.8h, v23.8h\n"
- "ldr q24, [x14, x15]\n"
+ "ldr q24, [x12, x8]\n"
"fmla v5.8h, v18.8h, v0.8h\n"
"fmla v30.8h, v18.8h, v28.8h\n"
"fmla v29.8h, v18.8h, v26.8h\n"
- "ldr q23, [x8, #0x100]\n"
+ "ldr q23, [x6, #0x100]\n"
"fmla v31.8h, v17.8h, v0.8h\n"
- "ldr q22, [x14, x11]\n"
+ "ldr q22, [x12, x16]\n"
+ "add x12, x12, #0x10\n"
"fmla v5.8h, v17.8h, v20.8h\n"
- "add x14, x14, #0x10\n"
"fmla v30.8h, v17.8h, v26.8h\n"
"fmla v29.8h, v17.8h, v24.8h\n"
- "ldr q21, [x8, #0x110]\n"
+ "ldr q21, [x6, #0x110]\n"
"fmla v31.8h, v2.8h, v20.8h\n"
- "ld1 { v18.8h }, [x12]\n"
+ "ld1 { v18.8h }, [x11]\n"
"fmla v5.8h, v2.8h, v19.8h\n"
"fmla v30.8h, v2.8h, v24.8h\n"
"fmla v29.8h, v2.8h, v13.8h\n"
- "ldr q20, [x8, #0x120]\n"
+ "ldr q20, [x6, #0x120]\n"
"fmla v31.8h, v14.8h, v19.8h\n"
- "ldr q17, [x12, x2]\n"
+ "ldr q17, [x11, x2]\n"
"fmla v5.8h, v14.8h, v16.8h\n"
"fmla v30.8h, v14.8h, v13.8h\n"
"fmla v29.8h, v14.8h, v22.8h\n"
- "ldr q19, [x8, #0x130]\n"
- "add x8, x8, #0x140\n"
+ "ldr q19, [x6, #0x130]\n"
+ "add x6, x6, #0x140\n"
"fmla v31.8h, v25.8h, v7.8h\n"
- "ldr q16, [x12, x6]\n"
+ "ldr q16, [x11, x7]\n"
"fmla v5.8h, v25.8h, v28.8h\n"
"fmla v30.8h, v25.8h, v18.8h\n"
- "ldr q18, [x12, x15]\n"
+ "ldr q18, [x11, x8]\n"
"fmla v29.8h, v25.8h, v17.8h\n"
"fmla v31.8h, v23.8h, v28.8h\n"
"fmla v5.8h, v23.8h, v26.8h\n"
"fmla v30.8h, v23.8h, v17.8h\n"
- "ldr q17, [x12, x13]\n"
+ "ldr q17, [x11, x17]\n"
"fmla v29.8h, v23.8h, v16.8h\n"
"fmla v31.8h, v21.8h, v26.8h\n"
"fmla v5.8h, v21.8h, v24.8h\n"
"fmla v30.8h, v21.8h, v16.8h\n"
- "ldr q16, [x12, x11]\n"
+ "ldr q16, [x11, x16]\n"
+ "add x11, x11, #0x10\n"
"fmla v29.8h, v21.8h, v18.8h\n"
- "add x12, x12, #0x10\n"
"fmla v31.8h, v20.8h, v24.8h\n"
"fmla v5.8h, v20.8h, v13.8h\n"
"fmla v30.8h, v20.8h, v18.8h\n"
"fmla v29.8h, v20.8h, v17.8h\n"
"fmla v31.8h, v19.8h, v13.8h\n"
"fmla v5.8h, v19.8h, v22.8h\n"
- "fmax v31.8h, v31.8h, v27.8h\n"
"fmla v30.8h, v19.8h, v17.8h\n"
"fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"fmax v5.8h, v5.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
"fmax v30.8h, v30.8h, v27.8h\n"
"fmax v29.8h, v29.8h, v27.8h\n"
- "fmin v31.8h, v31.8h, v15.8h\n"
"fmin v5.8h, v5.8h, v15.8h\n"
"st1 { v31.8h }, [x5]\n"
"fmin v30.8h, v30.8h, v15.8h\n"
@@ -509,23 +509,23 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x7\n"
"beq 117f\n"
- "ldr q25, [x8, #0x0]\n"
- "ldr q0, [x8, #0x10]\n"
+ "ldr q25, [x6, #0x0]\n"
+ "ldr q0, [x6, #0x10]\n"
"add x9, x4, XZR\n"
"add x28, x4, x2\n"
- "ldr q1, [x8, #0x20]\n"
- "ldr q2, [x8, #0x30]\n"
- "add x27, x7, XZR\n"
- "add x26, x7, x2\n"
- "ldr q3, [x8, #0x40]\n"
- "ldr q4, [x8, #0x50]\n"
- "add x25, x4, x6\n"
- "add x24, x7, x6\n"
- "add x23, x4, x15\n"
- "add x22, x4, x13\n"
- "add x21, x7, x11\n"
- "add x20, x17, XZR\n"
- "add x8, x8, #0x60\n"
+ "ldr q1, [x6, #0x20]\n"
+ "ldr q2, [x6, #0x30]\n"
+ "add x27, x15, XZR\n"
+ "add x26, x15, x2\n"
+ "ldr q3, [x6, #0x40]\n"
+ "ldr q4, [x6, #0x50]\n"
+ "add x25, x4, x7\n"
+ "add x24, x15, x7\n"
+ "add x23, x4, x8\n"
+ "add x22, x4, x17\n"
+ "add x21, x15, x16\n"
+ "add x20, x14, XZR\n"
+ "add x6, x6, #0x60\n"
"tbz %x[n_channels], #2, 6f\n"
"ldr d5, [x9], #0x8\n"
"ldr d6, [x28], #0x8\n"
@@ -611,7 +611,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
"mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v5.8h\n"
"mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v6.8h\n"
- "add x20, x7, x15\n"
+ "add x20, x15, x8\n"
"mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v8.8h\n"
"fmla v28.8h, v1.8h, v6.8h\n"
@@ -643,7 +643,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
"fmla v31.8h, v2.8h, v5.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x20, x7, x13\n"
+ "add x20, x15, x17\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v5.8h\n"
"tbz %x[n_channels], #2, 14f\n"
@@ -668,7 +668,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
"fmla v31.8h, v3.8h, v6.8h\n"
"fmla v28.8h, v4.8h, v12.8h\n"
- "add x20, x4, x11\n"
+ "add x20, x4, x16\n"
"tbz %x[n_channels], #2, 18f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 17f\n"
@@ -689,13 +689,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"19:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset: Bit 1: Unset
"ldr h9, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v29.8h, v4.8h, v9.8h\n"
"fmla v30.8h, v4.8h, v6.8h\n"
- "add x20, x17, x2\n"
+ "add x20, x14, x2\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v0.8h, v7.8h\n"
- "add x8, x8, #0x10\n"
"fmla v29.8h, v0.8h, v8.8h\n"
"fmla v30.8h, v0.8h, v14.8h\n"
"tbz %x[n_channels], #2, 22f\n"
@@ -718,13 +718,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"23:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
"ldr h11, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.8h, v0.8h, v11.8h\n"
+ "add x20, x14, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v1.8h, v8.8h\n"
- "add x20, x17, x6\n"
"fmla v29.8h, v1.8h, v13.8h\n"
"fmla v30.8h, v1.8h, v11.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 26f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 25f\n"
@@ -745,13 +745,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
"ldr h12, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x14, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v2.8h, v13.8h\n"
- "add x20, x17, x15\n"
"fmla v29.8h, v2.8h, v5.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 30f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 29f\n"
@@ -772,13 +772,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"31:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
"ldr h9, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
+ "add x20, x14, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v3.8h, v5.8h\n"
- "add x20, x17, x13\n"
"fmla v29.8h, v3.8h, v6.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 34f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 33f\n"
@@ -799,13 +799,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"35:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
"ldr h13, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.8h, v3.8h, v13.8h\n"
+ "add x20, x14, x16\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v4.8h, v6.8h\n"
- "add x20, x17, x11\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 38f\n"
"ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #1, 37f\n"
@@ -826,12 +826,12 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"39:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
"ldr h8, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v31.8h, v4.8h, v8.8h\n"
+ "add x20, x13, XZR\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v0.8h, v14.8h\n"
- "add x20, x16, XZR\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 42f\n"
"ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #1, 41f\n"
@@ -853,7 +853,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h5, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
"fmla v30.8h, v0.8h, v5.8h\n"
- "add x20, x16, x2\n"
+ "add x20, x13, x2\n"
"tbz %x[n_channels], #2, 46f\n"
"ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #1, 45f\n"
@@ -874,13 +874,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
"ldr h6, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.8h, v0.8h, v6.8h\n"
+ "add x20, x13, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v1.8h, v11.8h\n"
- "add x20, x16, x6\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v30.8h, v1.8h, v6.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 50f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 49f\n"
@@ -901,13 +901,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
"ldr h10, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.8h, v1.8h, v10.8h\n"
+ "add x20, x13, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v2.8h, v12.8h\n"
- "add x20, x16, x15\n"
"fmla v29.8h, v2.8h, v9.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 54f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 53f\n"
@@ -928,13 +928,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"55:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
"ldr h11, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x13, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v3.8h, v9.8h\n"
- "add x20, x16, x13\n"
"fmla v29.8h, v3.8h, v13.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 58f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 57f\n"
@@ -955,13 +955,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"59:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
"ldr h12, [x20, #0x0]\n"
"60:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
+ "add x20, x13, x16\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v4.8h, v13.8h\n"
- "add x20, x16, x11\n"
"fmla v29.8h, v4.8h, v8.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 62f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #1, 61f\n"
@@ -982,12 +982,12 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"63:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
"ldr h14, [x20, #0x0]\n"
"64:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v31.8h, v4.8h, v14.8h\n"
+ "add x20, x12, XZR\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v0.8h, v5.8h\n"
- "add x20, x14, XZR\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 66f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 65f\n"
@@ -1009,7 +1009,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h9, [x20, #0x0]\n"
"68:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
"fmla v30.8h, v0.8h, v9.8h\n"
- "add x20, x14, x2\n"
+ "add x20, x12, x2\n"
"tbz %x[n_channels], #2, 70f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #1, 69f\n"
@@ -1030,13 +1030,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"71:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
"ldr h13, [x20, #0x0]\n"
"72:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.8h, v0.8h, v13.8h\n"
+ "add x20, x12, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v1.8h, v6.8h\n"
- "add x20, x14, x6\n"
"fmla v29.8h, v1.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v13.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 74f\n"
"ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #1, 73f\n"
@@ -1057,13 +1057,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"75:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
"ldr h5, [x20, #0x0]\n"
"76:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.8h, v1.8h, v5.8h\n"
+ "add x20, x12, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v2.8h, v10.8h\n"
- "add x20, x14, x15\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v5.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 78f\n"
"ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #1, 77f\n"
@@ -1084,13 +1084,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"79:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
"ldr h6, [x20, #0x0]\n"
"80:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.8h, v2.8h, v6.8h\n"
+ "add x20, x12, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x20, x14, x13\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v6.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 82f\n"
"ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #1, 81f\n"
@@ -1111,13 +1111,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"83:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
"ldr h8, [x20, #0x0]\n"
"84:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.8h, v3.8h, v8.8h\n"
+ "add x20, x12, x16\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v4.8h, v12.8h\n"
- "add x20, x14, x11\n"
"fmla v29.8h, v4.8h, v14.8h\n"
"fmla v30.8h, v4.8h, v8.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 86f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #1, 85f\n"
@@ -1138,12 +1138,12 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"87:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
"ldr h10, [x20, #0x0]\n"
"88:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "add x20, x11, XZR\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v0.8h, v9.8h\n"
- "add x20, x12, XZR\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 90f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 89f\n"
@@ -1165,7 +1165,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr h11, [x20, #0x0]\n"
"92:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x20, x12, x2\n"
+ "add x20, x11, x2\n"
"tbz %x[n_channels], #2, 94f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 93f\n"
@@ -1186,13 +1186,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"95:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
"ldr h12, [x20, #0x0]\n"
"96:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
+ "add x20, x11, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v1.8h, v13.8h\n"
- "add x20, x12, x6\n"
"fmla v29.8h, v1.8h, v5.8h\n"
"fmla v30.8h, v1.8h, v12.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 98f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #1, 97f\n"
@@ -1213,13 +1213,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"99:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
"ldr h9, [x20, #0x0]\n"
"100:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
+ "add x20, x11, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v2.8h, v5.8h\n"
- "add x20, x12, x15\n"
"fmla v29.8h, v2.8h, v6.8h\n"
"fmla v30.8h, v2.8h, v9.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 102f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #1, 101f\n"
@@ -1240,13 +1240,13 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"103:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
"ldr h11, [x20, #0x0]\n"
"104:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x11, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.8h, v3.8h, v6.8h\n"
- "add x20, x12, x13\n"
"fmla v29.8h, v3.8h, v8.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #2, 106f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #1, 105f\n"
@@ -1267,10 +1267,10 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"107:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
"ldr h12, [x20, #0x0]\n"
"108:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
+ "add x20, x11, x16\n"
"fmla v28.8h, v4.8h, v8.8h\n"
- "add x20, x12, x11\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
"tbz %x[n_channels], #2, 110f\n"
@@ -1297,27 +1297,27 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmax v28.8h, v28.8h, v27.8h\n"
"fmax v29.8h, v29.8h, v27.8h\n"
"fmax v30.8h, v30.8h, v27.8h\n"
- "fmax v31.8h, v31.8h, v27.8h\n"
"fmin v28.8h, v28.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"fmin v29.8h, v29.8h, v15.8h\n"
"fmin v30.8h, v30.8h, v15.8h\n"
"fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 114f\n"
"mov x21, x5\n"
"mov x20, x10\n"
- "st1 { v28.d }[0], [x21], x3\n"
- "st1 { v30.d }[0], [x20], x3\n"
"add x5, x5, #0x8\n"
"add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
"st1 { v29.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #1, 113f\n"
"mov x21, x5\n"
"mov x20, x10\n"
- "st1 { v28.s }[2], [x21], x3\n"
- "st1 { v30.s }[2], [x20], x3\n"
"add x5, x5, #0x4\n"
"add x10, x10, #0x4\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
"st1 { v29.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"tbz %x[n_channels], #0, 116f\n"
@@ -1341,10 +1341,10 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"tbz %x[n_channels], #1, 115f\n"
"mov x21, x5\n"
"mov x20, x10\n"
- "st1 { v28.s }[0], [x21], x3\n"
- "st1 { v30.s }[0], [x20], x3\n"
"add x5, x5, #0x4\n"
"add x10, x10, #0x4\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
"st1 { v29.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"tbz %x[n_channels], #0, 116f\n"
@@ -1364,16 +1364,16 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v31.h }[0], [x20]\n"
"116:" // Tile loop: Oddments: Store: Bit 2: End
"117:" // Tile loop: End
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x26, x26, #0x1\n"
- "add x21, x27, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x26, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x27, x27, x21, LT\n"
- "csel x26, x26, XZR, LT\n"
- "cmp x27, x20\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x10, x10, #0x1\n"
+ "add x20, x11, #0x1\n"
+ "cmp x10, x22\n"
+ "csel x11, x11, x20, LT\n"
+ "csel x10, x10, XZR, LT\n"
+ "cmp x11, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 4913340c4c..c8a599b0a9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,478 +98,478 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x17, #0x10\n" // cntb _, ALL, #1
- "lsr x9, %x[n_channels], #0x3\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x3\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v27.8h }, [x20]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x21]\n"
"ld1r { v15.8h }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "ldp x12, x11, [x21, #0x10]\n"
- "mov x10, #0x0\n"
- "sub x28, XZR, x17\n"
- "cbz x9, 3f\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x22, #0x0]\n"
+ "ldp x11, x10, [x22, #0x10]\n"
+ "sub x9, XZR, x8\n"
+ "cbz x17, 3f\n"
"ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "cmp x17, x9, LSL #4\n"
+ "cmp x8, x17, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "ldr q5, [x21, x10]\n"
- "ldr q6, [x20, x10]\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q7, [x21, x10]\n"
- "ldr q8, [x20, x10]\n"
- "ldp x21, x20, [x15, #0x20]\n"
- "ldr q9, [x21, x10]\n"
- "ldr q13, [x20, x10]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr q11, [x21, x10]\n"
- "ldr q12, [x20, x10]\n"
+ "ldr q5, [x27, x14]\n"
+ "ldr q6, [x26, x14]\n"
+ "ldr q7, [x25, x14]\n"
+ "ldr q8, [x24, x14]\n"
+ "ldr q9, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
"ldp x21, x20, [x15, #0x40]\n"
- "ldr q10, [x21, x10]\n"
- "ldr q14, [x20, x10]\n"
+ "ldr q10, [x21, x14]\n"
+ "ldr q14, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v5.8h\n"
"mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v6.8h\n"
- "ldr x20, [x15, #0x50]\n"
- "ldr q24, [x20, x10]\n"
- "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v7.8h\n"
- "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v8.8h\n"
"ldr q23, [x16, #0x0]\n"
"ldr q26, [x16, #0x140]\n"
+ "ldr x22, [x15, #0x60]\n"
+ "ldr x25, [x15, #0x68]\n"
+ "add x9, x9, #0x10\n"
+ "ldr q22, [x21, x14]\n"
+ "ldr x24, [x15, #0x70]\n"
"fmla v30.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x20, x14]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
- "ldr x20, [x15, #0x58]\n"
- "ldr q22, [x20, x10]\n"
- "fmla v28.8h, v1.8h, v8.8h\n"
- "fmla v29.8h, v1.8h, v13.8h\n"
- "ldr q21, [x16, #0x10]\n"
- "ldr x20, [x15, #0x60]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "ldr x27, [x15, #0x80]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "ldr x23, [x15, #0x90]\n"
"fmla v30.8h, v2.8h, v9.8h\n"
- "ldr q17, [x20, x10]\n"
+ "ldr q18, [x22, x14]\n"
+ "ldr x26, [x15, #0x98]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
- "ldr x20, [x15, #0x68]\n"
- "fmla v28.8h, v2.8h, v13.8h\n"
- "fmla v29.8h, v2.8h, v24.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v28.8h, v2.8h, v22.8h\n"
"ldr q16, [x16, #0x20]\n"
- "ldr x22, [x15, #0x70]\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "ldr q5, [x20, x10]\n"
+ "ldr q20, [x25, x14]\n"
+ "ldr x25, [x15, #0xa8]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v28.8h, v3.8h, v24.8h\n"
"fmla v29.8h, v3.8h, v22.8h\n"
- "ldr q20, [x16, #0x30]\n"
- "ldr x21, [x15, #0x80]\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "ldr q17, [x16, #0x30]\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "ldr q19, [x22, x10]\n"
- "fmla v31.8h, v4.8h, v17.8h\n"
- "ldr q2, [x20, x10]\n"
- "fmla v28.8h, v4.8h, v22.8h\n"
- "fmla v29.8h, v4.8h, v10.8h\n"
- "ldr q18, [x16, #0x40]\n"
- "ldr x20, [x15, #0x88]\n"
+ "ldr q3, [x24, x14]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v31.8h, v4.8h, v18.8h\n"
+ "ldr q2, [x21, x14]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q19, [x16, #0x40]\n"
"fmla v30.8h, v23.8h, v7.8h\n"
"fmla v31.8h, v23.8h, v8.8h\n"
- "ldr x23, [x15, #0x90]\n"
- "ldr x26, [x15, #0x98]\n"
- "fmla v28.8h, v23.8h, v14.8h\n"
- "fmla v29.8h, v23.8h, v5.8h\n"
- "ldr q1, [x16, #0x50]\n"
- "ldr x22, [x15, #0xa0]\n"
- "fmla v30.8h, v21.8h, v8.8h\n"
- "ldr q25, [x20, x10]\n"
- "fmla v31.8h, v21.8h, v13.8h\n"
- "ldr x25, [x15, #0xa8]\n"
- "fmla v28.8h, v21.8h, v5.8h\n"
- "fmla v29.8h, v21.8h, v19.8h\n"
- "ldr q17, [x16, #0x60]\n"
- "ldr x24, [x15, #0xb0]\n"
+ "fmla v29.8h, v23.8h, v14.8h\n"
+ "fmla v28.8h, v23.8h, v20.8h\n"
+ "ldr q18, [x16, #0x50]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "ldr q25, [x20, x14]\n"
+ "ldr x28, [x15, #0xc8]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v20.8h\n"
+ "fmla v28.8h, v0.8h, v3.8h\n"
+ "ldr q11, [x16, #0x60]\n"
"fmla v30.8h, v16.8h, v13.8h\n"
- "ldr q8, [x21, x10]\n"
- "fmla v31.8h, v16.8h, v24.8h\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v28.8h, v16.8h, v19.8h\n"
- "fmla v29.8h, v16.8h, v2.8h\n"
+ "ldr q24, [x27, x14]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.8h, v16.8h, v22.8h\n"
+ "fmla v29.8h, v16.8h, v3.8h\n"
+ "fmla v28.8h, v16.8h, v2.8h\n"
"ldr q16, [x16, #0x70]\n"
- "ldr x21, [x15, #0xc0]\n"
- "fmla v30.8h, v20.8h, v24.8h\n"
- "ldr q24, [x23, x10]\n"
- "fmla v31.8h, v20.8h, v22.8h\n"
- "ldr x27, [x15, #0xc8]\n"
- "fmla v28.8h, v20.8h, v2.8h\n"
- "fmla v29.8h, v20.8h, v8.8h\n"
- "ldr q23, [x16, #0x80]\n"
+ "fmla v30.8h, v17.8h, v22.8h\n"
+ "ldr q5, [x23, x14]\n"
"ldr x23, [x15, #0xd0]\n"
- "fmla v30.8h, v18.8h, v22.8h\n"
- "ldr q22, [x26, x10]\n"
- "fmla v31.8h, v18.8h, v10.8h\n"
- "ldr q21, [x22, x10]\n"
- "fmla v28.8h, v18.8h, v8.8h\n"
- "fmla v29.8h, v18.8h, v25.8h\n"
- "ldr q20, [x16, #0x90]\n"
- "ldr x22, [x15, #0xd8]\n"
- "fmla v30.8h, v1.8h, v14.8h\n"
- "ldr q0, [x20, x10]\n"
- "fmla v31.8h, v1.8h, v5.8h\n"
- "ldr x20, [x15, #0xe0]\n"
- "fmla v28.8h, v1.8h, v24.8h\n"
- "fmla v29.8h, v1.8h, v22.8h\n"
- "ldr q6, [x16, #0xa0]\n"
+ "fmla v31.8h, v17.8h, v21.8h\n"
+ "fmla v29.8h, v17.8h, v2.8h\n"
+ "fmla v28.8h, v17.8h, v24.8h\n"
+ "ldr q17, [x16, #0x80]\n"
+ "fmla v30.8h, v19.8h, v21.8h\n"
+ "ldr q23, [x26, x14]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.8h, v19.8h, v10.8h\n"
+ "ldr q22, [x22, x14]\n"
+ "ldr x22, [x15, #0xe0]\n"
+ "fmla v29.8h, v19.8h, v24.8h\n"
+ "fmla v28.8h, v19.8h, v25.8h\n"
+ "ldr q21, [x16, #0x90]\n"
+ "fmla v30.8h, v18.8h, v14.8h\n"
+ "ldr q1, [x21, x14]\n"
"ldr x26, [x15, #0xf8]\n"
- "fmla v30.8h, v17.8h, v5.8h\n"
- "ldr q1, [x25, x10]\n"
- "fmla v31.8h, v17.8h, v19.8h\n"
+ "fmla v31.8h, v18.8h, v20.8h\n"
+ "fmla v29.8h, v18.8h, v5.8h\n"
+ "fmla v28.8h, v18.8h, v23.8h\n"
+ "ldr q12, [x16, #0xa0]\n"
+ "fmla v30.8h, v11.8h, v20.8h\n"
+ "ldr q0, [x25, x14]\n"
"ldr x25, [x15, #0xe8]\n"
- "fmla v28.8h, v17.8h, v22.8h\n"
- "fmla v29.8h, v17.8h, v21.8h\n"
- "ldr q18, [x16, #0xb0]\n"
- "add x28, x28, #0x10\n"
- "fmla v30.8h, v16.8h, v19.8h\n"
- "ldr q19, [x24, x10]\n"
- "fmla v31.8h, v16.8h, v2.8h\n"
+ "fmla v31.8h, v11.8h, v3.8h\n"
+ "fmla v29.8h, v11.8h, v23.8h\n"
+ "fmla v28.8h, v11.8h, v22.8h\n"
+ "ldr q20, [x16, #0xb0]\n"
+ "fmla v30.8h, v16.8h, v3.8h\n"
+ "ldr q19, [x24, x14]\n"
"ldr x24, [x15, #0xf0]\n"
- "fmla v28.8h, v16.8h, v21.8h\n"
- "fmla v29.8h, v16.8h, v1.8h\n"
- "ldr q17, [x16, #0xc0]\n"
- "fmla v30.8h, v23.8h, v2.8h\n"
- "ldr q16, [x21, x10]\n"
- "fmla v31.8h, v23.8h, v8.8h\n"
+ "fmla v31.8h, v16.8h, v2.8h\n"
+ "fmla v29.8h, v16.8h, v22.8h\n"
+ "fmla v28.8h, v16.8h, v0.8h\n"
+ "ldr q18, [x16, #0xc0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "ldr q16, [x20, x14]\n"
"ldr x21, [x15, #0x100]\n"
- "fmla v28.8h, v23.8h, v1.8h\n"
- "fmla v29.8h, v23.8h, v19.8h\n"
- "ldr q13, [x16, #0xd0]\n"
- "fmla v30.8h, v20.8h, v8.8h\n"
- "ldr q2, [x27, x10]\n"
- "fmla v31.8h, v20.8h, v25.8h\n"
- "ldr q10, [x20, x10]\n"
- "fmla v28.8h, v20.8h, v19.8h\n"
- "fmla v29.8h, v20.8h, v0.8h\n"
- "ldr q9, [x16, #0xe0]\n"
+ "fmla v31.8h, v17.8h, v24.8h\n"
+ "fmla v29.8h, v17.8h, v0.8h\n"
+ "fmla v28.8h, v17.8h, v19.8h\n"
+ "ldr q17, [x16, #0xd0]\n"
+ "fmla v30.8h, v21.8h, v24.8h\n"
+ "ldr q14, [x28, x14]\n"
"ldr x20, [x15, #0x108]\n"
- "fmla v30.8h, v6.8h, v24.8h\n"
- "ldr q5, [x23, x10]\n"
- "fmla v31.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q4, [x22, x14]\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "fmla v28.8h, v21.8h, v1.8h\n"
+ "ldr q7, [x16, #0xe0]\n"
+ "fmla v30.8h, v12.8h, v5.8h\n"
+ "ldr q25, [x23, x14]\n"
"ldr x23, [x15, #0x110]\n"
- "fmla v28.8h, v6.8h, v16.8h\n"
- "fmla v29.8h, v6.8h, v2.8h\n"
- "ldr q24, [x16, #0xf0]\n"
- "fmla v30.8h, v18.8h, v22.8h\n"
- "ldr q25, [x22, x10]\n"
- "fmla v31.8h, v18.8h, v21.8h\n"
+ "fmla v31.8h, v12.8h, v23.8h\n"
+ "fmla v29.8h, v12.8h, v16.8h\n"
+ "fmla v28.8h, v12.8h, v14.8h\n"
+ "ldr q11, [x16, #0xf0]\n"
+ "fmla v30.8h, v20.8h, v23.8h\n"
+ "ldr q24, [x27, x14]\n"
"ldr x22, [x15, #0x118]\n"
- "fmla v28.8h, v18.8h, v2.8h\n"
- "fmla v29.8h, v18.8h, v5.8h\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "fmla v29.8h, v20.8h, v14.8h\n"
+ "fmla v28.8h, v20.8h, v25.8h\n"
"ldr q23, [x16, #0x100]\n"
- "fmla v30.8h, v17.8h, v21.8h\n"
- "ldr q22, [x25, x10]\n"
- "fmla v31.8h, v17.8h, v1.8h\n"
- "fmla v28.8h, v17.8h, v5.8h\n"
- "fmla v29.8h, v17.8h, v25.8h\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q22, [x25, x14]\n"
+ "fmla v31.8h, v18.8h, v0.8h\n"
+ "fmla v29.8h, v18.8h, v25.8h\n"
+ "fmla v28.8h, v18.8h, v24.8h\n"
"ldr q21, [x16, #0x110]\n"
- "fmla v30.8h, v13.8h, v1.8h\n"
- "ldr q18, [x24, x10]\n"
- "fmla v31.8h, v13.8h, v19.8h\n"
- "fmla v28.8h, v13.8h, v25.8h\n"
- "fmla v29.8h, v13.8h, v10.8h\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v31.8h, v17.8h, v19.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "fmla v28.8h, v17.8h, v4.8h\n"
"ldr q20, [x16, #0x120]\n"
- "fmla v30.8h, v9.8h, v19.8h\n"
- "ldr q17, [x26, x10]\n"
- "fmla v31.8h, v9.8h, v0.8h\n"
- "fmla v28.8h, v9.8h, v10.8h\n"
- "fmla v29.8h, v9.8h, v22.8h\n"
+ "fmla v30.8h, v7.8h, v19.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v31.8h, v7.8h, v1.8h\n"
+ "fmla v29.8h, v7.8h, v4.8h\n"
+ "fmla v28.8h, v7.8h, v22.8h\n"
"ldr q19, [x16, #0x130]\n"
- "fmla v30.8h, v24.8h, v16.8h\n"
- "ldr q16, [x21, x10]\n"
- "fmla v31.8h, v24.8h, v2.8h\n"
- "fmla v28.8h, v24.8h, v18.8h\n"
- "ldr q18, [x20, x10]\n"
- "fmla v29.8h, v24.8h, v17.8h\n"
+ "fmla v30.8h, v11.8h, v16.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v31.8h, v11.8h, v14.8h\n"
+ "fmla v29.8h, v11.8h, v18.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "ldp x20, x21, [x15, #0x0]\n"
+ "fmla v28.8h, v11.8h, v17.8h\n"
"ldr q0, [x16, #0x150]\n"
- "fmla v30.8h, v23.8h, v2.8h\n"
- "fmla v31.8h, v23.8h, v5.8h\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "fmla v28.8h, v23.8h, v17.8h\n"
- "ldr q17, [x23, x10]\n"
- "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v30.8h, v23.8h, v14.8h\n"
+ "fmla v31.8h, v23.8h, v25.8h\n"
+ "fmla v29.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x23, x14]\n"
+ "fmla v28.8h, v23.8h, v16.8h\n"
"ldr q1, [x16, #0x160]\n"
- "fmla v30.8h, v21.8h, v5.8h\n"
- "ldr q5, [x21, x17]\n"
- "fmla v31.8h, v21.8h, v25.8h\n"
- "fmla v28.8h, v21.8h, v16.8h\n"
- "ldr q16, [x22, x10]\n"
- "fmla v29.8h, v21.8h, v18.8h\n"
+ "fmla v30.8h, v21.8h, v25.8h\n"
+ "ldr q5, [x20, x8]\n"
+ "fmla v31.8h, v21.8h, v24.8h\n"
+ "fmla v29.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "ldp x20, x26, [x15, #0x10]\n"
+ "ldp x25, x24, [x15, #0x20]\n"
+ "ldp x23, x22, [x15, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q7, [x20, x8]\n"
+ "fmla v28.8h, v21.8h, v18.8h\n"
"ldr q2, [x16, #0x170]\n"
- "fmla v30.8h, v20.8h, v25.8h\n"
- "ldr q6, [x20, x17]\n"
- "fmla v31.8h, v20.8h, v10.8h\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q7, [x21, x17]\n"
- "fmla v28.8h, v20.8h, v18.8h\n"
- "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v30.8h, v20.8h, v24.8h\n"
+ "ldr q6, [x21, x8]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q13, [x24, x8]\n"
+ "fmla v31.8h, v20.8h, v4.8h\n"
+ "fmla v29.8h, v20.8h, v18.8h\n"
+ "ldr q11, [x23, x8]\n"
+ "ldr q14, [x20, x8]\n"
+ "fmla v28.8h, v20.8h, v17.8h\n"
"ldr q3, [x16, #0x180]\n"
- "fmla v30.8h, v19.8h, v10.8h\n"
- "ldr q8, [x20, x17]\n"
+ "fmla v30.8h, v19.8h, v4.8h\n"
+ "ldr q8, [x26, x8]\n"
"fmla v31.8h, v19.8h, v22.8h\n"
- "ldp x21, x20, [x15, #0x20]\n"
- "ldr q13, [x20, x17]\n"
- "fmla v28.8h, v19.8h, v17.8h\n"
- "fmla v29.8h, v19.8h, v16.8h\n"
- "ldr q9, [x21, x17]\n"
+ "ldr q10, [x21, x8]\n"
+ "fmla v29.8h, v19.8h, v17.8h\n"
+ "ldr q12, [x22, x8]\n"
+ "fmla v28.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x25, x8]\n"
+ "add x8, x8, #0x10\n"
"ldr q4, [x16, #0x190]\n"
- "ldp x21, x20, [x15, #0x30]\n"
+ "cmp x8, x17, LSL #4\n"
+ "add x16, x16, #0x1a0\n"
"fmax v30.8h, v30.8h, v27.8h\n"
"fmax v31.8h, v31.8h, v27.8h\n"
- "ldr q11, [x21, x17]\n"
- "ldr q12, [x20, x17]\n"
- "fmax v28.8h, v28.8h, v27.8h\n"
"fmax v29.8h, v29.8h, v27.8h\n"
- "ldp x21, x20, [x15, #0x40]\n"
- "ldr q10, [x21, x17]\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
"fmin v30.8h, v30.8h, v15.8h\n"
"fmin v31.8h, v31.8h, v15.8h\n"
- "ldr q14, [x20, x17]\n"
- "add x17, x17, #0x10\n"
- "cmp x17, x9, LSL #4\n"
- "fmin v28.8h, v28.8h, v15.8h\n"
"fmin v29.8h, v29.8h, v15.8h\n"
- "add x10, x10, #0x10\n"
- "str q30, [x14, x28]\n"
- "add x16, x16, #0x1a0\n"
- "str q31, [x13, x28]\n"
- "str q28, [x12, x28]\n"
- "str q29, [x11, x28]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q30, [x13, x9]\n"
+ "str q31, [x12, x9]\n"
+ "str q29, [x11, x9]\n"
+ "str q28, [x10, x9]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v5.8h\n"
"mov v5.16b, v26.16b\n fmla v5.8h, v0.8h, v6.8h\n"
- "ldr x20, [x15, #0x50]\n"
- "ldr q22, [x20, x10]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "ldr x21, [x15, #0x58]\n"
"mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
"ldr q19, [x16, #0x0]\n"
- "ldr x20, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x25, [x15, #0x68]\n"
+ "ldr x24, [x15, #0x70]\n"
+ "add x9, x9, #0x10\n"
+ "ldr q22, [x22, x14]\n"
+ "ldr x23, [x15, #0x78]\n"
"fmla v31.8h, v1.8h, v6.8h\n"
- "ldr q21, [x20, x10]\n"
+ "ldr q21, [x21, x14]\n"
"fmla v5.8h, v1.8h, v9.8h\n"
- "ldr x21, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x80]\n"
"fmla v30.8h, v1.8h, v8.8h\n"
"fmla v29.8h, v1.8h, v13.8h\n"
"ldr q18, [x16, #0x10]\n"
- "ldr x20, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
- "ldr q16, [x21, x10]\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v5.8h, v2.8h, v11.8h\n"
- "ldr x23, [x15, #0x70]\n"
"fmla v30.8h, v2.8h, v13.8h\n"
"fmla v29.8h, v2.8h, v22.8h\n"
"ldr q17, [x16, #0x20]\n"
- "ldr x21, [x15, #0x78]\n"
"fmla v31.8h, v3.8h, v11.8h\n"
- "ldr q6, [x20, x10]\n"
+ "ldr q6, [x25, x14]\n"
+ "ldr x25, [x15, #0xa8]\n"
"fmla v5.8h, v3.8h, v12.8h\n"
- "ldr x22, [x15, #0x80]\n"
"fmla v30.8h, v3.8h, v22.8h\n"
"fmla v29.8h, v3.8h, v21.8h\n"
"ldr q20, [x16, #0x30]\n"
- "ldr x20, [x15, #0x88]\n"
"fmla v31.8h, v4.8h, v12.8h\n"
- "ldr q2, [x23, x10]\n"
+ "ldr q2, [x24, x14]\n"
+ "ldr x24, [x15, #0xb0]\n"
"fmla v5.8h, v4.8h, v16.8h\n"
- "ldr q28, [x21, x10]\n"
+ "ldr q28, [x23, x14]\n"
+ "ldr x23, [x15, #0xb8]\n"
"fmla v30.8h, v4.8h, v21.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"ldr q16, [x16, #0x40]\n"
- "ldr x21, [x15, #0x90]\n"
"fmla v31.8h, v19.8h, v7.8h\n"
"fmla v5.8h, v19.8h, v8.8h\n"
- "ldr x27, [x15, #0x98]\n"
- "ldr x26, [x15, #0xa0]\n"
"fmla v30.8h, v19.8h, v14.8h\n"
"fmla v29.8h, v19.8h, v6.8h\n"
"ldr q19, [x16, #0x50]\n"
- "ldr x25, [x15, #0xa8]\n"
"fmla v31.8h, v18.8h, v8.8h\n"
- "ldr q1, [x20, x10]\n"
+ "ldr q1, [x22, x14]\n"
+ "ldr x28, [x15, #0xc8]\n"
"fmla v5.8h, v18.8h, v13.8h\n"
- "ldr x24, [x15, #0xb0]\n"
"fmla v30.8h, v18.8h, v6.8h\n"
"fmla v29.8h, v18.8h, v2.8h\n"
"ldr q18, [x16, #0x60]\n"
- "ldr x20, [x15, #0xb8]\n"
"fmla v31.8h, v17.8h, v13.8h\n"
- "ldr q26, [x22, x10]\n"
+ "ldr q26, [x27, x14]\n"
+ "ldr x22, [x15, #0xc0]\n"
"fmla v5.8h, v17.8h, v22.8h\n"
- "ldr x23, [x15, #0xc0]\n"
"fmla v30.8h, v17.8h, v2.8h\n"
"fmla v29.8h, v17.8h, v28.8h\n"
"ldr q17, [x16, #0x70]\n"
- "ldr x22, [x15, #0xc8]\n"
"fmla v31.8h, v20.8h, v22.8h\n"
- "ldr q25, [x21, x10]\n"
- "fmla v5.8h, v20.8h, v21.8h\n"
+ "ldr q25, [x21, x14]\n"
"ldr x21, [x15, #0xd0]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
"fmla v30.8h, v20.8h, v28.8h\n"
"fmla v29.8h, v20.8h, v26.8h\n"
"ldr q24, [x16, #0x80]\n"
- "add x28, x28, #0x10\n"
"fmla v31.8h, v16.8h, v21.8h\n"
- "ldr q23, [x27, x10]\n"
+ "ldr q23, [x26, x14]\n"
+ "ldr x27, [x15, #0xd8]\n"
"fmla v5.8h, v16.8h, v10.8h\n"
- "ldr q0, [x26, x10]\n"
+ "ldr q0, [x20, x14]\n"
+ "ldr x20, [x15, #0xe0]\n"
"fmla v30.8h, v16.8h, v26.8h\n"
"fmla v29.8h, v16.8h, v1.8h\n"
"ldr q22, [x16, #0x90]\n"
- "ldr x27, [x15, #0xd8]\n"
"fmla v31.8h, v19.8h, v14.8h\n"
- "ldr q16, [x20, x10]\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0xf8]\n"
"fmla v5.8h, v19.8h, v6.8h\n"
- "ldr x20, [x15, #0xe0]\n"
"fmla v30.8h, v19.8h, v25.8h\n"
"fmla v29.8h, v19.8h, v23.8h\n"
"ldr q21, [x16, #0xa0]\n"
- "ldr x26, [x15, #0xf8]\n"
"fmla v31.8h, v18.8h, v6.8h\n"
- "ldr q20, [x25, x10]\n"
- "fmla v5.8h, v18.8h, v2.8h\n"
+ "ldr q20, [x25, x14]\n"
"ldr x25, [x15, #0xe8]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
"fmla v30.8h, v18.8h, v23.8h\n"
"fmla v29.8h, v18.8h, v0.8h\n"
"ldr q18, [x16, #0xb0]\n"
"fmla v31.8h, v17.8h, v2.8h\n"
- "ldr q19, [x24, x10]\n"
- "fmla v5.8h, v17.8h, v28.8h\n"
+ "ldr q19, [x24, x14]\n"
"ldr x24, [x15, #0xf0]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
"fmla v30.8h, v17.8h, v0.8h\n"
"fmla v29.8h, v17.8h, v20.8h\n"
"ldr q17, [x16, #0xc0]\n"
"fmla v31.8h, v24.8h, v28.8h\n"
- "ldr q7, [x23, x10]\n"
- "fmla v5.8h, v24.8h, v26.8h\n"
+ "ldr q10, [x22, x14]\n"
"ldr x23, [x15, #0x100]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
"fmla v30.8h, v24.8h, v20.8h\n"
"fmla v29.8h, v24.8h, v19.8h\n"
- "ldr q3, [x16, #0xd0]\n"
+ "ldr q13, [x16, #0xd0]\n"
"fmla v31.8h, v22.8h, v26.8h\n"
- "ldr q28, [x22, x10]\n"
+ "ldr q28, [x28, x14]\n"
+ "ldr x22, [x15, #0x108]\n"
"fmla v5.8h, v22.8h, v1.8h\n"
- "ldr q13, [x20, x10]\n"
+ "ldr q14, [x20, x14]\n"
"fmla v30.8h, v22.8h, v19.8h\n"
"fmla v29.8h, v22.8h, v16.8h\n"
- "ldr q11, [x16, #0xe0]\n"
- "ldr x22, [x15, #0x108]\n"
+ "ldr q12, [x16, #0xe0]\n"
"fmla v31.8h, v21.8h, v25.8h\n"
- "ldr q26, [x21, x10]\n"
- "fmla v5.8h, v21.8h, v23.8h\n"
+ "ldr q26, [x21, x14]\n"
"ldr x21, [x15, #0x110]\n"
- "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "fmla v30.8h, v21.8h, v10.8h\n"
"fmla v29.8h, v21.8h, v28.8h\n"
"ldr q25, [x16, #0xf0]\n"
"fmla v31.8h, v18.8h, v23.8h\n"
- "ldr q24, [x27, x10]\n"
- "fmla v5.8h, v18.8h, v0.8h\n"
+ "ldr q24, [x27, x14]\n"
"ldr x20, [x15, #0x118]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
"fmla v30.8h, v18.8h, v28.8h\n"
"fmla v29.8h, v18.8h, v26.8h\n"
"ldr q23, [x16, #0x100]\n"
"fmla v31.8h, v17.8h, v0.8h\n"
- "ldr q22, [x25, x10]\n"
+ "ldr q22, [x25, x14]\n"
"fmla v5.8h, v17.8h, v20.8h\n"
"fmla v30.8h, v17.8h, v26.8h\n"
"fmla v29.8h, v17.8h, v24.8h\n"
"ldr q21, [x16, #0x110]\n"
- "fmla v31.8h, v3.8h, v20.8h\n"
- "ldr q18, [x24, x10]\n"
- "fmla v5.8h, v3.8h, v19.8h\n"
- "fmla v30.8h, v3.8h, v24.8h\n"
- "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v31.8h, v13.8h, v20.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v5.8h, v13.8h, v19.8h\n"
+ "fmla v30.8h, v13.8h, v24.8h\n"
+ "fmla v29.8h, v13.8h, v14.8h\n"
"ldr q20, [x16, #0x120]\n"
- "fmla v31.8h, v11.8h, v19.8h\n"
- "ldr q17, [x26, x10]\n"
- "fmla v5.8h, v11.8h, v16.8h\n"
- "fmla v30.8h, v11.8h, v13.8h\n"
- "fmla v29.8h, v11.8h, v22.8h\n"
+ "fmla v31.8h, v12.8h, v19.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v5.8h, v12.8h, v16.8h\n"
+ "fmla v30.8h, v12.8h, v14.8h\n"
+ "fmla v29.8h, v12.8h, v22.8h\n"
"ldr q19, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
- "fmla v31.8h, v25.8h, v7.8h\n"
- "ldr q16, [x23, x10]\n"
+ "fmla v31.8h, v25.8h, v10.8h\n"
+ "ldr q16, [x23, x14]\n"
"fmla v5.8h, v25.8h, v28.8h\n"
"fmla v30.8h, v25.8h, v18.8h\n"
- "ldr q18, [x22, x10]\n"
+ "ldr q18, [x22, x14]\n"
"fmla v29.8h, v25.8h, v17.8h\n"
"fmla v31.8h, v23.8h, v28.8h\n"
"fmla v5.8h, v23.8h, v26.8h\n"
"fmla v30.8h, v23.8h, v17.8h\n"
- "ldr q17, [x21, x10]\n"
+ "ldr q17, [x21, x14]\n"
"fmla v29.8h, v23.8h, v16.8h\n"
"fmla v31.8h, v21.8h, v26.8h\n"
"fmla v5.8h, v21.8h, v24.8h\n"
"fmla v30.8h, v21.8h, v16.8h\n"
- "ldr q16, [x20, x10]\n"
+ "ldr q16, [x20, x14]\n"
+ "add x14, x14, #0x10\n"
"fmla v29.8h, v21.8h, v18.8h\n"
- "add x10, x10, #0x10\n"
"fmla v31.8h, v20.8h, v24.8h\n"
- "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v5.8h, v20.8h, v14.8h\n"
"fmla v30.8h, v20.8h, v18.8h\n"
"fmla v29.8h, v20.8h, v17.8h\n"
- "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
"fmla v5.8h, v19.8h, v22.8h\n"
- "fmax v31.8h, v31.8h, v27.8h\n"
"fmla v30.8h, v19.8h, v17.8h\n"
"fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"fmax v5.8h, v5.8h, v27.8h\n"
"fmax v30.8h, v30.8h, v27.8h\n"
- "fmax v29.8h, v29.8h, v27.8h\n"
"fmin v31.8h, v31.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
"fmin v5.8h, v5.8h, v15.8h\n"
- "str q31, [x14, x28]\n"
"fmin v30.8h, v30.8h, v15.8h\n"
"fmin v29.8h, v29.8h, v15.8h\n"
- "str q5, [x13, x28]\n"
- "str q30, [x12, x28]\n"
- "str q29, [x11, x28]\n"
+ "str q31, [x13, x9]\n"
+ "str q5, [x12, x9]\n"
+ "str q30, [x11, x9]\n"
+ "str q29, [x10, x9]\n"
"3:" // Oddments
"tst %x[n_channels], #0x7\n"
"beq 116f\n"
"ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "mov x20, x10\n"
- "add x14, x14, x20\n"
+ "mov x20, x14\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x13, x13, x20\n"
- "add x12, x12, x20\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "add x11, x11, x20\n"
"ldr x9, [x15, #0x0]\n"
"ldr x28, [x15, #0x8]\n"
- "add x9, x9, x10\n"
- "add x28, x28, x10\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"ldr x27, [x15, #0x10]\n"
"ldr x26, [x15, #0x18]\n"
- "add x27, x27, x10\n"
- "add x26, x26, x10\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"ldr x25, [x15, #0x20]\n"
"ldr x24, [x15, #0x28]\n"
- "add x25, x25, x10\n"
- "add x24, x24, x10\n"
"ldr x23, [x15, #0x30]\n"
"ldr x22, [x15, #0x38]\n"
- "add x23, x23, x10\n"
- "add x22, x22, x10\n"
+ "add x9, x9, x14\n"
+ "add x28, x28, x14\n"
"ldr x21, [x15, #0x40]\n"
"ldr x20, [x15, #0x48]\n"
- "add x21, x21, x10\n"
- "add x20, x20, x10\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"add x16, x16, #0x60\n"
"tbz %x[n_channels], #2, 5f\n"
"ld1 { v5.d }[0], [x9], #0x8\n"
@@ -657,9 +657,9 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v5.8h\n"
"mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v6.8h\n"
"ldr x20, [x15, #0x50]\n"
- "add x20, x20, x10\n"
"mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
"mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "add x20, x20, x14\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v9.8h\n"
"fmla v30.8h, v1.8h, v8.8h\n"
@@ -690,9 +690,9 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x58]\n"
"fmla v31.8h, v2.8h, v5.8h\n"
"fmla v28.8h, v3.8h, v11.8h\n"
- "add x20, x20, x10\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v5.8h\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 13f\n"
"ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 12f\n"
@@ -716,7 +716,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x60]\n"
"fmla v31.8h, v3.8h, v6.8h\n"
"fmla v28.8h, v4.8h, v12.8h\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 17f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
@@ -742,11 +742,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v30.8h, v4.8h, v6.8h\n"
"ldr x20, [x15, #0x68]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v0.8h, v7.8h\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"fmla v29.8h, v0.8h, v8.8h\n"
"fmla v30.8h, v0.8h, v14.8h\n"
- "add x16, x16, #0x10\n"
"tbz %x[n_channels], #2, 21f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 20f\n"
@@ -770,11 +770,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0x70]\n"
"fmla v31.8h, v0.8h, v11.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v1.8h, v8.8h\n"
"fmla v29.8h, v1.8h, v13.8h\n"
"fmla v30.8h, v1.8h, v11.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 25f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 24f\n"
@@ -798,11 +798,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0x78]\n"
"fmla v31.8h, v1.8h, v12.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v2.8h, v13.8h\n"
"fmla v29.8h, v2.8h, v5.8h\n"
"fmla v30.8h, v2.8h, v12.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 29f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 28f\n"
@@ -826,11 +826,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0x80]\n"
"fmla v31.8h, v2.8h, v9.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v3.8h, v5.8h\n"
"fmla v29.8h, v3.8h, v6.8h\n"
"fmla v30.8h, v3.8h, v9.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 33f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 32f\n"
@@ -854,11 +854,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q4, [x16, #0x0]\n"
"ldr x20, [x15, #0x88]\n"
"fmla v31.8h, v3.8h, v13.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v4.8h, v6.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v13.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 37f\n"
"ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 36f\n"
@@ -882,10 +882,10 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q0, [x16, #0x0]\n"
"ldr x20, [x15, #0x90]\n"
"fmla v31.8h, v4.8h, v8.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v0.8h, v14.8h\n"
"fmla v29.8h, v0.8h, v11.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 41f\n"
"ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 40f\n"
@@ -908,7 +908,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"43:" // Oddments: Load input (3, 0): Bit 2: End
"ldr x20, [x15, #0x98]\n"
"fmla v30.8h, v0.8h, v5.8h\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 45f\n"
"ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 44f\n"
@@ -932,11 +932,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0xa0]\n"
"fmla v31.8h, v0.8h, v6.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v1.8h, v11.8h\n"
"fmla v29.8h, v1.8h, v12.8h\n"
"fmla v30.8h, v1.8h, v6.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 49f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 48f\n"
@@ -960,11 +960,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0xa8]\n"
"fmla v31.8h, v1.8h, v10.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v2.8h, v12.8h\n"
"fmla v29.8h, v2.8h, v9.8h\n"
"fmla v30.8h, v2.8h, v10.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 53f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 52f\n"
@@ -988,11 +988,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0xb0]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v3.8h, v9.8h\n"
"fmla v29.8h, v3.8h, v13.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 57f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 56f\n"
@@ -1016,11 +1016,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q4, [x16, #0x0]\n"
"ldr x20, [x15, #0xb8]\n"
"fmla v31.8h, v3.8h, v12.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v4.8h, v13.8h\n"
"fmla v29.8h, v4.8h, v8.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 61f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 60f\n"
@@ -1044,10 +1044,10 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q0, [x16, #0x0]\n"
"ldr x20, [x15, #0xc0]\n"
"fmla v31.8h, v4.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v0.8h, v5.8h\n"
"fmla v29.8h, v0.8h, v6.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 65f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 64f\n"
@@ -1070,7 +1070,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"67:" // Oddments: Load input (4, 0): Bit 2: End
"ldr x20, [x15, #0xc8]\n"
"fmla v30.8h, v0.8h, v9.8h\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 69f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 68f\n"
@@ -1094,11 +1094,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0xd0]\n"
"fmla v31.8h, v0.8h, v13.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v1.8h, v6.8h\n"
"fmla v29.8h, v1.8h, v10.8h\n"
"fmla v30.8h, v1.8h, v13.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 73f\n"
"ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 72f\n"
@@ -1122,11 +1122,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0xd8]\n"
"fmla v31.8h, v1.8h, v5.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v2.8h, v10.8h\n"
"fmla v29.8h, v2.8h, v11.8h\n"
"fmla v30.8h, v2.8h, v5.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 77f\n"
"ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 76f\n"
@@ -1150,11 +1150,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0xe0]\n"
"fmla v31.8h, v2.8h, v6.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v3.8h, v11.8h\n"
"fmla v29.8h, v3.8h, v12.8h\n"
"fmla v30.8h, v3.8h, v6.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 81f\n"
"ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 80f\n"
@@ -1178,11 +1178,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q4, [x16, #0x0]\n"
"ldr x20, [x15, #0xe8]\n"
"fmla v31.8h, v3.8h, v8.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v4.8h, v12.8h\n"
"fmla v29.8h, v4.8h, v14.8h\n"
"fmla v30.8h, v4.8h, v8.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 85f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 84f\n"
@@ -1206,10 +1206,10 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q0, [x16, #0x0]\n"
"ldr x20, [x15, #0xf0]\n"
"fmla v31.8h, v4.8h, v10.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v0.8h, v9.8h\n"
"fmla v29.8h, v0.8h, v13.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 89f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 88f\n"
@@ -1232,7 +1232,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"91:" // Oddments: Load input (5, 0): Bit 2: End
"ldr x20, [x15, #0xf8]\n"
"fmla v30.8h, v0.8h, v11.8h\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 93f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 92f\n"
@@ -1256,11 +1256,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0x100]\n"
"fmla v31.8h, v0.8h, v12.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v1.8h, v13.8h\n"
"fmla v29.8h, v1.8h, v5.8h\n"
"fmla v30.8h, v1.8h, v12.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 97f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 96f\n"
@@ -1284,11 +1284,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0x108]\n"
"fmla v31.8h, v1.8h, v9.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v2.8h, v5.8h\n"
"fmla v29.8h, v2.8h, v6.8h\n"
"fmla v30.8h, v2.8h, v9.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 101f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 100f\n"
@@ -1312,11 +1312,11 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0x110]\n"
"fmla v31.8h, v2.8h, v11.8h\n"
+ "add x16, x16, #0x10\n"
"fmla v28.8h, v3.8h, v6.8h\n"
"fmla v29.8h, v3.8h, v8.8h\n"
"fmla v30.8h, v3.8h, v11.8h\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 105f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 104f\n"
@@ -1343,7 +1343,7 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v28.8h, v4.8h, v8.8h\n"
"fmla v29.8h, v4.8h, v10.8h\n"
"fmla v30.8h, v4.8h, v12.8h\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #2, 109f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #1, 108f\n"
@@ -1368,56 +1368,56 @@ void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmax v28.8h, v28.8h, v27.8h\n"
"fmax v29.8h, v29.8h, v27.8h\n"
"fmax v30.8h, v30.8h, v27.8h\n"
- "fmax v31.8h, v31.8h, v27.8h\n"
"fmin v28.8h, v28.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
"fmin v29.8h, v29.8h, v15.8h\n"
"fmin v30.8h, v30.8h, v15.8h\n"
"fmin v31.8h, v31.8h, v15.8h\n"
"tbz %x[n_channels], #2, 113f\n"
- "st1 { v28.d }[0], [x14], #0x8\n"
- "st1 { v29.d }[0], [x13], #0x8\n"
- "st1 { v30.d }[0], [x12], #0x8\n"
- "st1 { v31.d }[0], [x11], #0x8\n"
+ "st1 { v28.d }[0], [x13], #0x8\n"
+ "st1 { v29.d }[0], [x12], #0x8\n"
+ "st1 { v30.d }[0], [x11], #0x8\n"
+ "st1 { v31.d }[0], [x10], #0x8\n"
"tbz %x[n_channels], #1, 112f\n"
- "st1 { v28.s }[2], [x14], #0x4\n"
- "st1 { v29.s }[2], [x13], #0x4\n"
- "st1 { v30.s }[2], [x12], #0x4\n"
- "st1 { v31.s }[2], [x11], #0x4\n"
+ "st1 { v28.s }[2], [x13], #0x4\n"
+ "st1 { v29.s }[2], [x12], #0x4\n"
+ "st1 { v30.s }[2], [x11], #0x4\n"
+ "st1 { v31.s }[2], [x10], #0x4\n"
"tbz %x[n_channels], #0, 115f\n"
- "st1 { v28.h }[6], [x14], #0x2\n"
- "st1 { v29.h }[6], [x13], #0x2\n"
- "st1 { v30.h }[6], [x12], #0x2\n"
- "st1 { v31.h }[6], [x11], #0x2\n"
+ "st1 { v28.h }[6], [x13], #0x2\n"
+ "st1 { v29.h }[6], [x12], #0x2\n"
+ "st1 { v30.h }[6], [x11], #0x2\n"
+ "st1 { v31.h }[6], [x10], #0x2\n"
"b 115f\n"
"112:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 115f\n"
- "st1 { v28.h }[4], [x14], #0x2\n"
- "st1 { v29.h }[4], [x13], #0x2\n"
- "st1 { v30.h }[4], [x12], #0x2\n"
- "st1 { v31.h }[4], [x11], #0x2\n"
+ "st1 { v28.h }[4], [x13], #0x2\n"
+ "st1 { v29.h }[4], [x12], #0x2\n"
+ "st1 { v30.h }[4], [x11], #0x2\n"
+ "st1 { v31.h }[4], [x10], #0x2\n"
"b 115f\n"
"113:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 114f\n"
- "st1 { v28.s }[0], [x14], #0x4\n"
- "st1 { v29.s }[0], [x13], #0x4\n"
- "st1 { v30.s }[0], [x12], #0x4\n"
- "st1 { v31.s }[0], [x11], #0x4\n"
+ "st1 { v28.s }[0], [x13], #0x4\n"
+ "st1 { v29.s }[0], [x12], #0x4\n"
+ "st1 { v30.s }[0], [x11], #0x4\n"
+ "st1 { v31.s }[0], [x10], #0x4\n"
"tbz %x[n_channels], #0, 115f\n"
- "st1 { v28.h }[2], [x14], #0x2\n"
- "st1 { v29.h }[2], [x13], #0x2\n"
- "st1 { v30.h }[2], [x12], #0x2\n"
- "st1 { v31.h }[2], [x11], #0x2\n"
+ "st1 { v28.h }[2], [x13], #0x2\n"
+ "st1 { v29.h }[2], [x12], #0x2\n"
+ "st1 { v30.h }[2], [x11], #0x2\n"
+ "st1 { v31.h }[2], [x10], #0x2\n"
"b 115f\n"
"114:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
- "st1 { v28.h }[0], [x14], #0x2\n"
- "st1 { v29.h }[0], [x13], #0x2\n"
- "st1 { v30.h }[0], [x12], #0x2\n"
- "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v28.h }[0], [x13], #0x2\n"
+ "st1 { v29.h }[0], [x12], #0x2\n"
+ "st1 { v30.h }[0], [x11], #0x2\n"
+ "st1 { v31.h }[0], [x10], #0x2\n"
"115:" // Oddments: Store: Bit 2: End
"116:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 08f40b785f..e4c8793b75 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,97 +56,97 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x26, %x[inptrs]\n"
- "ldp x21, x20, [x26], #0x10\n"
- "subs x25, %x[n_points], #0x1\n"
- "ldr q14, [x21, x11]\n"
- "ldr q15, [x20, x11]\n"
+ "mov x23, %x[inptrs]\n"
+ "subs x22, %x[n_points], #0x1\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x21, x20, [x26], #0x10\n"
- "ldr q16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
+ "add %x[params], %x[params], #0x10\n"
"mov v27.16b, v23.16b\n"
- "ldr q17, [x20, x11]\n"
- "ldp x21, x20, [x26], #0x10\n"
"mov v28.16b, v23.16b\n"
+ "ldp x21, x20, [x23], #0x10\n"
"mov v29.16b, v23.16b\n"
- "ldr q18, [x21, x11]\n"
- "ldr q19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x21, x20, [x26], #0x10\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ldr q20, [x21, x11]\n"
- "add %x[params], %x[params], #0x10\n"
"ldr q21, [x20, x11]\n"
- "ldr x20, [x26], #0x8\n"
+ "ldr x20, [x23], #0x8\n"
"ldr q22, [x20, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x20, x24, [x26], #0x10\n"
- "ldp x23, x22, [x26], #0x10\n"
- "subs x25, x25, #0x1\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
"fmla v23.8h, v14.8h, v0.8h\n"
- "ldr q14, [x20, x11]\n"
- "ldp x21, x20, [x26], #0x10\n"
"fmla v24.8h, v15.8h, v0.8h\n"
"fmla v25.8h, v16.8h, v0.8h\n"
- "ldr q15, [x24, x11]\n"
- "ldr q16, [x23, x11]\n"
"fmla v26.8h, v17.8h, v0.8h\n"
"fmla v27.8h, v18.8h, v0.8h\n"
- "ldr q17, [x22, x11]\n"
- "ldr q18, [x21, x11]\n"
"fmla v28.8h, v19.8h, v0.8h\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"fmla v29.8h, v20.8h, v0.8h\n"
- "ldr q19, [x20, x11]\n"
- "ldp x21, x20, [x26], #0x10\n"
"fmla v30.8h, v21.8h, v0.8h\n"
"fmla v31.8h, v22.8h, v0.8h\n"
"ldr q0, [%x[params], #0x0]\n"
- "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q20, [x21, x11]\n"
"ldr q21, [x20, x11]\n"
- "ldr x20, [x26], #0x8\n"
+ "ldr x20, [x23], #0x8\n"
"ldr q22, [x20, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla v23.8h, v14.8h, v0.8h\n"
"fmla v24.8h, v15.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v2.8h\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
"fmla v25.8h, v16.8h, v0.8h\n"
"fmla v26.8h, v17.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v2.8h\n"
- "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla v27.8h, v18.8h, v0.8h\n"
"fmla v28.8h, v19.8h, v0.8h\n"
- "fmax v25.8h, v25.8h, v2.8h\n"
- "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"fmla v29.8h, v20.8h, v0.8h\n"
"fmla v30.8h, v21.8h, v0.8h\n"
- "fmax v26.8h, v26.8h, v2.8h\n"
- "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
"fmax v27.8h, v27.8h, v2.8h\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
"fmax v28.8h, v28.8h, v2.8h\n"
"fmax v29.8h, v29.8h, v2.8h\n"
"fmax v30.8h, v30.8h, v2.8h\n"
"fmax v31.8h, v31.8h, v2.8h\n"
"fmin v23.8h, v23.8h, v1.8h\n"
"fmin v24.8h, v24.8h, v1.8h\n"
- "str q23, [x28, x11]\n"
"fmin v25.8h, v25.8h, v1.8h\n"
"fmin v26.8h, v26.8h, v1.8h\n"
- "str q24, [x27, x11]\n"
"fmin v27.8h, v27.8h, v1.8h\n"
"fmin v28.8h, v28.8h, v1.8h\n"
- "str q25, [x26, x11]\n"
"fmin v29.8h, v29.8h, v1.8h\n"
"fmin v30.8h, v30.8h, v1.8h\n"
- "str q26, [x25, x11]\n"
+ "str q23, [x28, x11]\n"
"fmin v31.8h, v31.8h, v1.8h\n"
+ "str q24, [x27, x11]\n"
+ "str q25, [x26, x11]\n"
+ "str q26, [x25, x11]\n"
"str q27, [x24, x11]\n"
"str q28, [x23, x11]\n"
"str q29, [x22, x11]\n"
@@ -184,29 +184,29 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"10:" // Oddments: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x27, x26, [x10], #0x10\n"
- "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x23, x22, [x10], #0x10\n"
- "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
+ "add %x[params], %x[params], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
"add x9, x9, x11\n"
"add x28, x28, x11\n"
- "mov v31.16b, v23.16b\n"
+ "ldp x25, x24, [x10], #0x10\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 12f\n"
"ldr d14, [x9], #0x8\n"
"ldr d15, [x28], #0x8\n"
@@ -287,30 +287,30 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"ble 20f\n"
"15:" // Oddments: Planar loop
"ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
"fmla v23.8h, v14.8h, v0.8h\n"
"fmla v24.8h, v15.8h, v0.8h\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x10], #0x10\n"
"fmla v25.8h, v16.8h, v0.8h\n"
"fmla v26.8h, v17.8h, v0.8h\n"
- "ldr x21, [x10], #0x8\n"
"fmla v27.8h, v18.8h, v0.8h\n"
"fmla v28.8h, v19.8h, v0.8h\n"
- "add x9, x9, x11\n"
+ "ldp x27, x26, [x10], #0x10\n"
"fmla v29.8h, v20.8h, v0.8h\n"
"fmla v30.8h, v21.8h, v0.8h\n"
+ "add x9, x9, x11\n"
"add x28, x28, x11\n"
- "add x27, x27, x11\n"
"fmla v31.8h, v22.8h, v0.8h\n"
"ldr q0, [%x[params], #0x0]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #2, 17f\n"
"ldr d14, [x9], #0x8\n"
"ldr d15, [x28], #0x8\n"
@@ -392,40 +392,40 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
"20:" // Oddments: Planar tail
"fmla v23.8h, v14.8h, v0.8h\n"
"fmla v24.8h, v15.8h, v0.8h\n"
- "fmax v23.8h, v23.8h, v2.8h\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
"fmla v25.8h, v16.8h, v0.8h\n"
"fmla v26.8h, v17.8h, v0.8h\n"
- "fmax v24.8h, v24.8h, v2.8h\n"
- "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla v27.8h, v18.8h, v0.8h\n"
"fmla v28.8h, v19.8h, v0.8h\n"
- "fmax v25.8h, v25.8h, v2.8h\n"
- "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"fmla v29.8h, v20.8h, v0.8h\n"
"fmla v30.8h, v21.8h, v0.8h\n"
- "fmax v26.8h, v26.8h, v2.8h\n"
- "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
"fmax v27.8h, v27.8h, v2.8h\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x28, x28, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"fmax v28.8h, v28.8h, v2.8h\n"
"fmax v29.8h, v29.8h, v2.8h\n"
- "add x27, x27, x11\n"
- "add x26, x26, x11\n"
+ "add x20, x20, x11\n"
"fmax v30.8h, v30.8h, v2.8h\n"
"fmax v31.8h, v31.8h, v2.8h\n"
- "add x25, x25, x11\n"
- "add x24, x24, x11\n"
"fmin v23.8h, v23.8h, v1.8h\n"
"fmin v24.8h, v24.8h, v1.8h\n"
- "add x23, x23, x11\n"
- "add x22, x22, x11\n"
"fmin v25.8h, v25.8h, v1.8h\n"
"fmin v26.8h, v26.8h, v1.8h\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
"fmin v27.8h, v27.8h, v1.8h\n"
"fmin v28.8h, v28.8h, v1.8h\n"
"fmin v29.8h, v29.8h, v1.8h\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index cee3fb59c5..d3a2e06453 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,20 +58,20 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"2:" // Output channel loop: Load bias: Done
"ldr q6, [%x[weights], #0x0]\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
- "ldr q1, [x21, #0x0]\n"
- "ldr q0, [x20, #0x0]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
"mov v18.16b, v31.16b\n"
- "mov v19.16b, v31.16b\n"
"add %x[weights], %x[weights], #0x10\n"
+ "mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
+ "ldp x21, x20, [x22], #0x10\n"
"mov v21.16b, v31.16b\n"
"mov v22.16b, v31.16b\n"
"mov v23.16b, v31.16b\n"
"mov v24.16b, v31.16b\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
"mov v25.16b, v31.16b\n"
"mov v26.16b, v31.16b\n"
"mov v27.16b, v31.16b\n"
@@ -160,71 +160,71 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v31.8h, v6.8h, v0.h[7]\n"
"fmla v16.8h, v5.8h, v4.h[0]\n"
"fmla v17.8h, v5.8h, v4.h[1]\n"
- "fmin v16.8h, v16.8h, v7.8h\n"
"fmla v18.8h, v5.8h, v4.h[2]\n"
"fmla v19.8h, v5.8h, v4.h[3]\n"
- "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v20.8h, v5.8h, v4.h[4]\n"
"fmla v21.8h, v5.8h, v4.h[5]\n"
- "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v22.8h, v5.8h, v4.h[6]\n"
"fmla v23.8h, v5.8h, v4.h[7]\n"
- "fmin v19.8h, v19.8h, v7.8h\n"
"fmla v24.8h, v5.8h, v3.h[0]\n"
"fmla v25.8h, v5.8h, v3.h[1]\n"
- "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
"fmla v26.8h, v5.8h, v3.h[2]\n"
"fmla v27.8h, v5.8h, v3.h[3]\n"
- "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v28.8h, v5.8h, v3.h[4]\n"
"fmla v29.8h, v5.8h, v3.h[5]\n"
- "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v30.8h, v5.8h, v3.h[6]\n"
"fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
"fmin v23.8h, v23.8h, v7.8h\n"
"fmax v16.8h, v16.8h, v8.8h\n"
"fmax v17.8h, v17.8h, v8.8h\n"
- "str q16, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"fmax v18.8h, v18.8h, v8.8h\n"
"fmax v19.8h, v19.8h, v8.8h\n"
- "str q17, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"fmax v20.8h, v20.8h, v8.8h\n"
"fmax v21.8h, v21.8h, v8.8h\n"
- "str q18, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"fmax v22.8h, v22.8h, v8.8h\n"
"fmax v23.8h, v23.8h, v8.8h\n"
- "str q19, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"fmin v24.8h, v24.8h, v7.8h\n"
"fmin v25.8h, v25.8h, v7.8h\n"
- "str q20, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"fmin v26.8h, v26.8h, v7.8h\n"
"fmin v27.8h, v27.8h, v7.8h\n"
- "str q21, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"fmin v28.8h, v28.8h, v7.8h\n"
"fmin v29.8h, v29.8h, v7.8h\n"
- "str q22, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"fmin v30.8h, v30.8h, v7.8h\n"
"fmin v31.8h, v31.8h, v7.8h\n"
- "str q23, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"fmax v24.8h, v24.8h, v8.8h\n"
"fmax v25.8h, v25.8h, v8.8h\n"
- "str q24, [x27, x28]\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"fmax v26.8h, v26.8h, v8.8h\n"
"fmax v27.8h, v27.8h, v8.8h\n"
- "str q25, [x26, x28]\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"fmax v28.8h, v28.8h, v8.8h\n"
"fmax v29.8h, v29.8h, v8.8h\n"
- "str q26, [x25, x28]\n"
"fmax v30.8h, v30.8h, v8.8h\n"
"fmax v31.8h, v31.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "str q25, [x26, x28]\n"
+ "str q26, [x25, x28]\n"
"str q27, [x24, x28]\n"
"str q28, [x23, x28]\n"
"str q29, [x22, x28]\n"
@@ -280,71 +280,71 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v31.8h, v5.8h, v3.h[7]\n"
"fmla v16.8h, v1.8h, v2.h[0]\n"
"fmla v17.8h, v1.8h, v2.h[1]\n"
- "fmin v16.8h, v16.8h, v7.8h\n"
"fmla v18.8h, v1.8h, v2.h[2]\n"
"fmla v19.8h, v1.8h, v2.h[3]\n"
- "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v20.8h, v1.8h, v2.h[4]\n"
"fmla v21.8h, v1.8h, v2.h[5]\n"
- "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v22.8h, v1.8h, v2.h[6]\n"
"fmla v23.8h, v1.8h, v2.h[7]\n"
- "fmin v19.8h, v19.8h, v7.8h\n"
"fmla v24.8h, v1.8h, v0.h[0]\n"
"fmla v25.8h, v1.8h, v0.h[1]\n"
- "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
"fmla v26.8h, v1.8h, v0.h[2]\n"
"fmla v27.8h, v1.8h, v0.h[3]\n"
- "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v28.8h, v1.8h, v0.h[4]\n"
"fmla v29.8h, v1.8h, v0.h[5]\n"
- "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v30.8h, v1.8h, v0.h[6]\n"
"fmla v31.8h, v1.8h, v0.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
"fmin v23.8h, v23.8h, v7.8h\n"
"fmax v16.8h, v16.8h, v8.8h\n"
"fmax v17.8h, v17.8h, v8.8h\n"
- "str q16, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"fmax v18.8h, v18.8h, v8.8h\n"
"fmax v19.8h, v19.8h, v8.8h\n"
- "str q17, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"fmax v20.8h, v20.8h, v8.8h\n"
"fmax v21.8h, v21.8h, v8.8h\n"
- "str q18, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"fmax v22.8h, v22.8h, v8.8h\n"
"fmax v23.8h, v23.8h, v8.8h\n"
- "str q19, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"fmin v24.8h, v24.8h, v7.8h\n"
"fmin v25.8h, v25.8h, v7.8h\n"
- "str q20, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"fmin v26.8h, v26.8h, v7.8h\n"
"fmin v27.8h, v27.8h, v7.8h\n"
- "str q21, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"fmin v28.8h, v28.8h, v7.8h\n"
"fmin v29.8h, v29.8h, v7.8h\n"
- "str q22, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"fmin v30.8h, v30.8h, v7.8h\n"
"fmin v31.8h, v31.8h, v7.8h\n"
- "str q23, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"fmax v24.8h, v24.8h, v8.8h\n"
"fmax v25.8h, v25.8h, v8.8h\n"
- "str q24, [x27, x28]\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"fmax v26.8h, v26.8h, v8.8h\n"
"fmax v27.8h, v27.8h, v8.8h\n"
- "str q25, [x26, x28]\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"fmax v28.8h, v28.8h, v8.8h\n"
"fmax v29.8h, v29.8h, v8.8h\n"
- "str q26, [x25, x28]\n"
"fmax v30.8h, v30.8h, v8.8h\n"
"fmax v31.8h, v31.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "str q25, [x26, x28]\n"
+ "str q26, [x25, x28]\n"
"str q27, [x24, x28]\n"
"str q28, [x23, x28]\n"
"str q29, [x22, x28]\n"
@@ -354,80 +354,80 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"6:" // Output channel loop: Single kernel point
"fmla v16.8h, v6.8h, v1.h[0]\n"
"fmla v17.8h, v6.8h, v1.h[1]\n"
- "fmin v16.8h, v16.8h, v7.8h\n"
"lsl x28, x10, #0x1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
"fmla v18.8h, v6.8h, v1.h[2]\n"
"fmla v19.8h, v6.8h, v1.h[3]\n"
- "fmin v17.8h, v17.8h, v7.8h\n"
- "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
"fmla v20.8h, v6.8h, v1.h[4]\n"
"fmla v21.8h, v6.8h, v1.h[5]\n"
- "fmin v18.8h, v18.8h, v7.8h\n"
- "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
"fmla v22.8h, v6.8h, v1.h[6]\n"
"fmla v23.8h, v6.8h, v1.h[7]\n"
- "fmin v19.8h, v19.8h, v7.8h\n"
- "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
"fmla v24.8h, v6.8h, v0.h[0]\n"
"fmla v25.8h, v6.8h, v0.h[1]\n"
- "fmin v20.8h, v20.8h, v7.8h\n"
- "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
"fmla v26.8h, v6.8h, v0.h[2]\n"
"fmla v27.8h, v6.8h, v0.h[3]\n"
- "fmin v21.8h, v21.8h, v7.8h\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
"fmla v28.8h, v6.8h, v0.h[4]\n"
"fmla v29.8h, v6.8h, v0.h[5]\n"
- "fmin v22.8h, v22.8h, v7.8h\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
"fmla v30.8h, v6.8h, v0.h[6]\n"
"fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
"fmin v23.8h, v23.8h, v7.8h\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"fmax v16.8h, v16.8h, v8.8h\n"
"fmax v17.8h, v17.8h, v8.8h\n"
- "str q16, [x27, x28]\n"
"fmax v18.8h, v18.8h, v8.8h\n"
"fmax v19.8h, v19.8h, v8.8h\n"
- "str q17, [x26, x28]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"fmax v20.8h, v20.8h, v8.8h\n"
"fmax v21.8h, v21.8h, v8.8h\n"
- "str q18, [x25, x28]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"fmax v22.8h, v22.8h, v8.8h\n"
"fmax v23.8h, v23.8h, v8.8h\n"
- "str q19, [x24, x28]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"fmin v24.8h, v24.8h, v7.8h\n"
"fmin v25.8h, v25.8h, v7.8h\n"
- "str q20, [x23, x28]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"fmin v26.8h, v26.8h, v7.8h\n"
"fmin v27.8h, v27.8h, v7.8h\n"
- "str q21, [x22, x28]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"fmin v28.8h, v28.8h, v7.8h\n"
"fmin v29.8h, v29.8h, v7.8h\n"
- "str q22, [x21, x28]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"fmin v30.8h, v30.8h, v7.8h\n"
"fmin v31.8h, v31.8h, v7.8h\n"
- "str q23, [x20, x28]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"fmax v24.8h, v24.8h, v8.8h\n"
"fmax v25.8h, v25.8h, v8.8h\n"
- "str q24, [x27, x28]\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"fmax v26.8h, v26.8h, v8.8h\n"
"fmax v27.8h, v27.8h, v8.8h\n"
- "str q25, [x26, x28]\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"fmax v28.8h, v28.8h, v8.8h\n"
"fmax v29.8h, v29.8h, v8.8h\n"
- "str q26, [x25, x28]\n"
"fmax v30.8h, v30.8h, v8.8h\n"
"fmax v31.8h, v31.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "str q25, [x26, x28]\n"
+ "str q26, [x25, x28]\n"
"str q27, [x24, x28]\n"
"str q28, [x23, x28]\n"
"str q29, [x22, x28]\n"
@@ -466,20 +466,20 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"13:" // Output channel oddments: Load bias: Done
"ldr q6, [%x[weights], #0x0]\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
- "ldr q1, [x21, #0x0]\n"
- "ldr q0, [x20, #0x0]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
"mov v18.16b, v31.16b\n"
- "mov v19.16b, v31.16b\n"
"add %x[weights], %x[weights], #0x10\n"
+ "mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
+ "ldp x21, x20, [x22], #0x10\n"
"mov v21.16b, v31.16b\n"
"mov v22.16b, v31.16b\n"
"mov v23.16b, v31.16b\n"
"mov v24.16b, v31.16b\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
"mov v25.16b, v31.16b\n"
"mov v26.16b, v31.16b\n"
"mov v27.16b, v31.16b\n"
@@ -682,47 +682,47 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #2, 20f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.d }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.d }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.d }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.d }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.d }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.d }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.d }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.d }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x10, x10, #0x4\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.d }[0], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.d }[0], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x4\n"
"st1 { v26.d }[0], [x25]\n"
"st1 { v27.d }[0], [x24]\n"
"st1 { v28.d }[0], [x23]\n"
@@ -732,47 +732,47 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #1, 19f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.s }[2], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.s }[2], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.s }[2], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.s }[2], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.s }[2], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.s }[2], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.s }[2], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.s }[2], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x10, x10, #0x2\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.s }[2], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.s }[2], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
"st1 { v26.s }[2], [x25]\n"
"st1 { v27.s }[2], [x24]\n"
"st1 { v28.s }[2], [x23]\n"
@@ -782,46 +782,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #0, 22f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.h }[6], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.h }[6], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.h }[6], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.h }[6], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.h }[6], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.h }[6], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.h }[6], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.h }[6], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.h }[6], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.h }[6], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v26.h }[6], [x25]\n"
"st1 { v27.h }[6], [x24]\n"
"st1 { v28.h }[6], [x23]\n"
@@ -833,46 +833,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #0, 22f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.h }[4], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.h }[4], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.h }[4], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.h }[4], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.h }[4], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.h }[4], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.h }[4], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.h }[4], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.h }[4], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.h }[4], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v26.h }[4], [x25]\n"
"st1 { v27.h }[4], [x24]\n"
"st1 { v28.h }[4], [x23]\n"
@@ -884,47 +884,47 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #1, 21f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.s }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.s }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.s }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.s }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.s }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.s }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.s }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.s }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
- "add x10, x10, #0x2\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.s }[0], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.s }[0], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
"st1 { v26.s }[0], [x25]\n"
"st1 { v27.s }[0], [x24]\n"
"st1 { v28.s }[0], [x23]\n"
@@ -934,46 +934,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #0, 22f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.h }[2], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.h }[2], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.h }[2], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.h }[2], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.h }[2], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.h }[2], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.h }[2], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.h }[2], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.h }[2], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.h }[2], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v26.h }[2], [x25]\n"
"st1 { v27.h }[2], [x24]\n"
"st1 { v28.h }[2], [x23]\n"
@@ -984,46 +984,46 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #1\n"
- "add x26, x26, x10, LSL #1\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #1\n"
- "add x24, x24, x10, LSL #1\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #1\n"
- "add x22, x22, x10, LSL #1\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #1\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v16.h }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v17.h }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #1\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v18.h }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #1\n"
"st1 { v19.h }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #1\n"
+ "add x27, x27, x10, LSL #1\n"
"st1 { v20.h }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
"st1 { v21.h }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #1\n"
+ "add x25, x25, x10, LSL #1\n"
"st1 { v22.h }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
"st1 { v23.h }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #1\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
"st1 { v24.h }[0], [x27]\n"
+ "add x21, x21, x10, LSL #1\n"
"st1 { v25.h }[0], [x26]\n"
+ "add x20, x20, x10, LSL #1\n"
"st1 { v26.h }[0], [x25]\n"
"st1 { v27.h }[0], [x24]\n"
"st1 { v28.h }[0], [x23]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index fd8686c15e..1cd980a6b3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,144 +87,144 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x23, #0x0\n"
- "mov x22, #0x0\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
"1:" // Tile loop
- "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x28, #0x2\n"
"mov x27, #0x2\n"
- "mov x26, #0x2\n"
- "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "str x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
"ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
"ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
- "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x15, x15, #0x2\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
- "lsr x22, %x[n_channels], #0x2\n"
- "add x11, x15, x15\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x9, x13, x25, LSL #2\n"
- "mul x20, x20, x26\n" // offset *= output_tile_size
- "add x28, x9, x25, LSL #2\n"
- "add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x24, %x[n_channels], #0x2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v27.4s }, [x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x23, #0x0\n"
"ld1r { v26.4s }, [x20]\n"
- "add x27, x28, x25, LSL #2\n"
- "add x26, x11, x15\n"
- "add x25, x12, x24, LSL #2\n"
+ "mul x22, x10, x26\n" // offset = tile_i * ld_input_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x21, XZR, x16\n"
+ "mul x20, x10, x25\n" // offset = tile_i * ld_output_row
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x22, x9, x15, x22\n" // offset += tile_j * ld_input_col
+ "lsl x15, x15, #0x2\n"
+ "madd x20, x9, x14, x20\n" // offset += tile_j * ld_output_col
"lsl x14, x14, #0x2\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q0, [x10, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q1, [x10, #0x20]\n"
- "ldr q2, [x10, #0x30]\n"
- "ldr q3, [x10, #0x40]\n"
- "ldr q4, [x10, #0x50]\n"
- "ldr q5, [x10, #0x60]\n"
- "ldr q6, [x10, #0x70]\n"
- "ldr q7, [x10, #0x80]\n"
- "ldr q8, [x10, #0x90]\n"
- "add x10, x10, #0xa0\n"
- "ldr q9, [x9, x15]\n"
+ "mul x22, x22, x28\n" // offset *= kernel_stride * output_size
+ "add x10, x15, x15\n"
+ "add x9, x10, x15\n"
+ "mul x20, x20, x27\n" // offset *= output_tile_size
+ "add x13, x13, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x28, x13, x26, LSL #2\n"
+ "add x27, x28, x26, LSL #2\n"
+ "add x26, x27, x26, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x25, x12, x25, LSL #2\n"
+ "cbz x24, 4f\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q0, [x11, #0x10]\n"
+ "cmp x16, x24, LSL #4\n"
+ "ldr q1, [x11, #0x20]\n"
+ "ldr q2, [x11, #0x30]\n"
+ "ldr q3, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
+ "add x11, x11, #0xa0\n"
+ "ldr q9, [x28, x15]\n"
"ld1 { v10.4s }, [x13]\n"
- "ldr q11, [x13, x26]\n"
- "ldr q12, [x9, x11]\n"
- "ldr q13, [x28, x15]\n"
+ "ldr q11, [x13, x9]\n"
+ "ldr q12, [x28, x10]\n"
+ "ldr q13, [x27, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
"mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
- "add x23, x23, #0x10\n"
- "cmp x23, x22, LSL #4\n"
+ "add x16, x16, #0x10\n"
+ "add x21, x21, #0x10\n"
"mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
"mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ld1 { v18.4s }, [x27]\n"
- "ldr q25, [x10, #0x0]\n"
+ "ld1 { v18.4s }, [x26]\n"
+ "ldr q25, [x11, #0x0]\n"
+ "cmp x16, x24, LSL #4\n"
+ "add x23, x23, #0x10\n"
"fmla v24.4s, v0.4s, v10.4s\n"
- "ldr q20, [x28, x11]\n"
+ "ldr q20, [x27, x10]\n"
"fmla v23.4s, v2.4s, v11.4s\n"
- "ldr q17, [x27, x26]\n"
+ "ldr q17, [x26, x9]\n"
"fmla v22.4s, v2.4s, v12.4s\n"
"fmla v21.4s, v1.4s, v12.4s\n"
- "add x20, x20, #0x10\n"
- "add x21, x21, #0x10\n"
"fmla v24.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"ldr q16, [x13, x15]\n"
"fmla v22.4s, v6.4s, v18.4s\n"
- "ldr q18, [x13, x11]\n"
- "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr q18, [x13, x10]\n"
"add x13, x13, #0x10\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
"fmla v24.4s, v7.4s, v13.4s\n"
"fmla v23.4s, v6.4s, v13.4s\n"
"fmla v22.4s, v4.4s, v13.4s\n"
"fmla v21.4s, v8.4s, v17.4s\n"
- "ld1 { v17.4s }, [x9]\n"
+ "ld1 { v17.4s }, [x28]\n"
"fmla v24.4s, v1.4s, v16.4s\n"
"fmla v23.4s, v0.4s, v16.4s\n"
- "ldr q16, [x9, x26]\n"
- "add x9, x9, #0x10\n"
+ "ldr q16, [x28, x9]\n"
+ "add x28, x28, #0x10\n"
"fmla v22.4s, v5.4s, v20.4s\n"
"fmla v21.4s, v4.4s, v20.4s\n"
- "ldr q4, [x10, #0x50]\n"
+ "ldr q4, [x11, #0x50]\n"
"fmla v24.4s, v2.4s, v18.4s\n"
"fmla v23.4s, v1.4s, v18.4s\n"
- "ld1 { v19.4s }, [x28]\n"
- "ldr q1, [x10, #0x20]\n"
+ "ld1 { v19.4s }, [x27]\n"
+ "ldr q1, [x11, #0x20]\n"
"fmla v22.4s, v0.4s, v17.4s\n"
- "ldr q0, [x10, #0x10]\n"
+ "ldr q0, [x11, #0x10]\n"
"fmla v21.4s, v2.4s, v16.4s\n"
- "ldr q2, [x10, #0x30]\n"
+ "ldr q2, [x11, #0x30]\n"
"fmla v24.4s, v8.4s, v20.4s\n"
"fmla v23.4s, v7.4s, v20.4s\n"
- "ldr q18, [x28, x26]\n"
- "add x28, x28, #0x10\n"
- "ldr q13, [x28, x15]\n"
+ "ldr q18, [x27, x9]\n"
+ "add x27, x27, #0x10\n"
+ "ldr q13, [x27, x15]\n"
"fmla v22.4s, v3.4s, v19.4s\n"
"fmla v21.4s, v5.4s, v18.4s\n"
"fmla v24.4s, v3.4s, v17.4s\n"
- "ldr q17, [x27, x15]\n"
- "ldr q3, [x10, #0x40]\n"
+ "ldr q17, [x26, x15]\n"
+ "ldr q3, [x11, #0x40]\n"
"fmla v23.4s, v5.4s, v16.4s\n"
- "ldr q16, [x27, x11]\n"
- "ldr q5, [x10, #0x60]\n"
+ "ldr q16, [x26, x10]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "add x26, x26, #0x10\n"
"fmla v22.4s, v7.4s, v17.4s\n"
"fmla v21.4s, v6.4s, v17.4s\n"
- "ldr q11, [x13, x26]\n"
+ "ldr q11, [x13, x9]\n"
"fmla v24.4s, v6.4s, v19.4s\n"
- "ldr q9, [x9, x15]\n"
+ "ldr q9, [x28, x15]\n"
+ "ldr q6, [x11, #0x70]\n"
"fmla v23.4s, v8.4s, v18.4s\n"
"ld1 { v10.4s }, [x13]\n"
- "ldr q6, [x10, #0x70]\n"
"fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x11, #0x90]\n"
"fmla v21.4s, v7.4s, v16.4s\n"
- "ldr q12, [x9, x11]\n"
- "ldr q7, [x10, #0x80]\n"
+ "ldr q12, [x28, x10]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "add x11, x11, #0xa0\n"
"fmax v24.4s, v24.4s, v27.4s\n"
"fmax v23.4s, v23.4s, v27.4s\n"
- "ldr q8, [x10, #0x90]\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"fmax v21.4s, v21.4s, v27.4s\n"
- "add x27, x27, #0x10\n"
"fmin v24.4s, v24.4s, v26.4s\n"
"fmin v23.4s, v23.4s, v26.4s\n"
- "st1 { v24.4s }, [x12]\n"
- "add x10, x10, #0xa0\n"
"fmin v22.4s, v22.4s, v26.4s\n"
"fmin v21.4s, v21.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
"str q23, [x12, x14]\n"
"add x12, x12, #0x10\n"
"st1 { v22.4s }, [x25]\n"
@@ -236,58 +236,58 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
"mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
"mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ld1 { v18.4s }, [x27]\n"
+ "ld1 { v18.4s }, [x26]\n"
"fmla v24.4s, v0.4s, v10.4s\n"
- "ldr q20, [x28, x11]\n"
+ "ldr q20, [x27, x10]\n"
"fmla v23.4s, v2.4s, v11.4s\n"
- "ldr q17, [x27, x26]\n"
+ "ldr q17, [x26, x9]\n"
"fmla v22.4s, v2.4s, v12.4s\n"
"fmla v21.4s, v1.4s, v12.4s\n"
"fmla v24.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"ldr q16, [x13, x15]\n"
"fmla v22.4s, v6.4s, v18.4s\n"
- "ldr q18, [x13, x11]\n"
- "fmla v21.4s, v3.4s, v13.4s\n"
+ "ldr q18, [x13, x10]\n"
"add x13, x13, #0x10\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
"fmla v24.4s, v7.4s, v13.4s\n"
"fmla v23.4s, v6.4s, v13.4s\n"
"fmla v22.4s, v4.4s, v13.4s\n"
"fmla v21.4s, v8.4s, v17.4s\n"
- "ld1 { v17.4s }, [x9]\n"
+ "ld1 { v17.4s }, [x28]\n"
"fmla v24.4s, v1.4s, v16.4s\n"
"fmla v23.4s, v0.4s, v16.4s\n"
- "ldr q16, [x9, x26]\n"
- "add x9, x9, #0x10\n"
+ "ldr q16, [x28, x9]\n"
+ "add x28, x28, #0x10\n"
"fmla v22.4s, v5.4s, v20.4s\n"
"fmla v21.4s, v4.4s, v20.4s\n"
"fmla v24.4s, v2.4s, v18.4s\n"
"fmla v23.4s, v1.4s, v18.4s\n"
- "ld1 { v19.4s }, [x28]\n"
+ "ld1 { v19.4s }, [x27]\n"
"fmla v22.4s, v0.4s, v17.4s\n"
"fmla v21.4s, v2.4s, v16.4s\n"
"fmla v24.4s, v8.4s, v20.4s\n"
"fmla v23.4s, v7.4s, v20.4s\n"
- "ldr q18, [x28, x26]\n"
- "add x28, x28, #0x10\n"
+ "ldr q18, [x27, x9]\n"
+ "add x27, x27, #0x10\n"
"fmla v22.4s, v3.4s, v19.4s\n"
"fmla v21.4s, v5.4s, v18.4s\n"
"fmla v24.4s, v3.4s, v17.4s\n"
- "ldr q17, [x27, x15]\n"
+ "ldr q17, [x26, x15]\n"
"fmla v23.4s, v5.4s, v16.4s\n"
- "ldr q16, [x27, x11]\n"
+ "ldr q16, [x26, x10]\n"
+ "add x26, x26, #0x10\n"
"fmla v22.4s, v7.4s, v17.4s\n"
"fmla v21.4s, v6.4s, v17.4s\n"
- "add x27, x27, #0x10\n"
"fmla v24.4s, v6.4s, v19.4s\n"
"fmla v23.4s, v8.4s, v18.4s\n"
- "fmax v24.4s, v24.4s, v27.4s\n"
"fmla v22.4s, v8.4s, v16.4s\n"
"fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
"fmax v23.4s, v23.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"fmax v21.4s, v21.4s, v27.4s\n"
- "fmin v24.4s, v24.4s, v26.4s\n"
"fmin v23.4s, v23.4s, v26.4s\n"
"st1 { v24.4s }, [x12]\n"
"fmin v22.4s, v22.4s, v26.4s\n"
@@ -300,21 +300,21 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 31f\n"
- "ldr q25, [x10, #0x0]\n"
- "ldr q0, [x10, #0x10]\n"
- "add x24, x9, x15\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q0, [x11, #0x10]\n"
+ "add x24, x28, x15\n"
"add x23, x13, XZR\n"
- "ldr q1, [x10, #0x20]\n"
- "ldr q2, [x10, #0x30]\n"
- "add x22, x13, x26\n"
- "add x21, x9, x11\n"
- "ldr q3, [x10, #0x40]\n"
- "ldr q4, [x10, #0x50]\n"
- "add x20, x28, x15\n"
- "ldr q5, [x10, #0x60]\n"
- "ldr q6, [x10, #0x70]\n"
- "ldr q7, [x10, #0x80]\n"
- "ldr q8, [x10, #0x90]\n"
+ "ldr q1, [x11, #0x20]\n"
+ "ldr q2, [x11, #0x30]\n"
+ "add x22, x13, x9\n"
+ "add x21, x28, x10\n"
+ "ldr q3, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ "add x20, x27, x15\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
"tbz %x[n_channels], #1, 5f\n"
"ldr d9, [x24], #0x8\n"
"ldr d10, [x23], #0x8\n"
@@ -337,15 +337,15 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
"mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
"mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "add x20, x27, XZR\n"
+ "add x20, x26, XZR\n"
"mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
"mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
"tbz %x[n_channels], #1, 7f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
@@ -356,10 +356,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
"fmla v30.4s, v6.4s, v9.4s\n"
"fmla v28.4s, v7.4s, v13.4s\n"
- "add x20, x27, x26\n"
+ "add x20, x26, x9\n"
"fmla v29.4s, v6.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
"fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
@@ -380,7 +380,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
"fmla v28.4s, v1.4s, v12.4s\n"
"fmla v29.4s, v0.4s, v12.4s\n"
- "add x20, x13, x11\n"
+ "add x20, x13, x10\n"
"tbz %x[n_channels], #1, 13f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
@@ -391,7 +391,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
"fmla v28.4s, v2.4s, v9.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
- "add x20, x28, x11\n"
+ "add x20, x27, x10\n"
"tbz %x[n_channels], #1, 15f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
@@ -402,7 +402,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
"fmla v28.4s, v8.4s, v10.4s\n"
"fmla v29.4s, v7.4s, v10.4s\n"
- "add x20, x9, XZR\n"
+ "add x20, x28, XZR\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
"tbz %x[n_channels], #1, 17f\n"
@@ -415,7 +415,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
"fmla v28.4s, v3.4s, v11.4s\n"
"fmla v30.4s, v0.4s, v11.4s\n"
- "add x20, x9, x26\n"
+ "add x20, x28, x9\n"
"tbz %x[n_channels], #1, 19f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
@@ -426,7 +426,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
"fmla v29.4s, v5.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v12.4s\n"
- "add x20, x28, XZR\n"
+ "add x20, x27, XZR\n"
"tbz %x[n_channels], #1, 21f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
@@ -437,7 +437,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
"fmla v28.4s, v6.4s, v9.4s\n"
"fmla v30.4s, v3.4s, v9.4s\n"
- "add x20, x28, x26\n"
+ "add x20, x27, x9\n"
"tbz %x[n_channels], #1, 23f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
@@ -448,7 +448,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
"fmla v29.4s, v8.4s, v10.4s\n"
"fmla v31.4s, v5.4s, v10.4s\n"
- "add x20, x27, x15\n"
+ "add x20, x26, x15\n"
"tbz %x[n_channels], #1, 25f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
@@ -459,7 +459,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
"fmla v30.4s, v7.4s, v11.4s\n"
"fmla v31.4s, v6.4s, v11.4s\n"
- "add x20, x27, x11\n"
+ "add x20, x26, x10\n"
"tbz %x[n_channels], #1, 27f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
@@ -472,19 +472,19 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmla v31.4s, v7.4s, v12.4s\n"
"fmax v28.4s, v28.4s, v27.4s\n"
"fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
"fmax v30.4s, v30.4s, v27.4s\n"
"fmax v31.4s, v31.4s, v27.4s\n"
- "fmin v28.4s, v28.4s, v26.4s\n"
"fmin v29.4s, v29.4s, v26.4s\n"
"fmin v30.4s, v30.4s, v26.4s\n"
"fmin v31.4s, v31.4s, v26.4s\n"
"tbz %x[n_channels], #1, 29f\n"
"mov x21, x12\n"
"mov x20, x25\n"
- "st1 { v28.d }[0], [x21], x14\n"
- "st1 { v30.d }[0], [x20], x14\n"
"add x12, x12, #0x8\n"
"add x25, x25, #0x8\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
"st1 { v29.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 30f\n"
@@ -504,20 +504,20 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v31.s }[0], [x20]\n"
"30:" // Tile loop: Oddments: Store: Bit 1: End
"31:" // Tile loop: End
- "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x22, x22, #0x1\n"
- "add x21, x23, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x22, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x23, x23, x21, LT\n"
- "csel x22, x22, XZR, LT\n"
- "cmp x23, x20\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x9, x9, #0x1\n"
+ "add x20, x10, #0x1\n"
+ "cmp x9, x22\n"
+ "csel x10, x10, x20, LT\n"
+ "csel x9, x9, XZR, LT\n"
+ "cmp x10, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 7dedfd972a..abe586725b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,237 +78,237 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x16, #0x10\n" // cntb _, ALL, #1
- "lsr x15, %x[n_channels], #0x2\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v27.4s }, [x20]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x16, %x[n_channels], #0x2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x21]\n"
"ld1r { v26.4s }, [x20]\n"
- "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x12, x11, [x21, #0x0]\n"
- "ldp x10, x9, [x21, #0x10]\n"
- "mov x28, #0x0\n"
- "sub x27, XZR, x16\n"
- "cbz x15, 3f\n"
- "ldr q25, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "cmp x16, x15, LSL #4\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "add x14, x14, #0xa0\n"
- "ldp x21, x20, [x13, #0x0]\n"
- "ldr q9, [x21, x28]\n"
- "ldr q10, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "ldr q11, [x21, x28]\n"
- "ldr q12, [x20, x28]\n"
- "ldr x20, [x13, #0x20]\n"
- "ldr q13, [x20, x28]\n"
+ "add x14, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x22, #0x0]\n"
+ "ldp x10, x9, [x22, #0x10]\n"
+ "sub x28, XZR, x17\n"
+ "cbz x16, 3f\n"
+ "ldr q25, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x17, x16, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x24, x23, [x14, #0x0]\n"
+ "ldp x22, x21, [x14, #0x10]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "ldr q9, [x24, x13]\n"
+ "ldr q10, [x23, x13]\n"
+ "ldr q11, [x22, x13]\n"
+ "ldr q12, [x21, x13]\n"
+ "ldr q13, [x20, x13]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
"mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
- "ldr x21, [x13, #0x28]\n"
- "ldr x20, [x13, #0x30]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "ldr x21, [x14, #0x30]\n"
"mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
"mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ldr q18, [x21, x28]\n"
- "ldr q25, [x14, #0x0]\n"
+ "ldr q25, [x15, #0x0]\n"
+ "ldr x24, [x14, #0x38]\n"
+ "ldr x20, [x14, #0x48]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q18, [x22, x13]\n"
+ "ldr x22, [x14, #0x50]\n"
"fmla v24.4s, v0.4s, v10.4s\n"
"fmla v23.4s, v2.4s, v11.4s\n"
- "ldr q17, [x20, x28]\n"
- "ldr x21, [x13, #0x38]\n"
+ "ldr q17, [x21, x13]\n"
+ "ldr x21, [x14, #0x58]\n"
+ "ldr q20, [x20, x13]\n"
"fmla v22.4s, v2.4s, v12.4s\n"
"fmla v21.4s, v1.4s, v12.4s\n"
- "ldr x20, [x13, #0x48]\n"
- "ldr q20, [x20, x28]\n"
+ "ldr x20, [x14, #0x60]\n"
+ "ldr x27, [x14, #0x68]\n"
+ "ldr x26, [x14, #0x70]\n"
"fmla v24.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x20, [x13, #0x40]\n"
+ "ldr q16, [x24, x13]\n"
+ "ldr x25, [x14, #0x78]\n"
"fmla v22.4s, v6.4s, v18.4s\n"
- "ldr q18, [x20, x28]\n"
+ "ldr q18, [x23, x13]\n"
+ "ldp x24, x23, [x14, #0x0]\n"
"fmla v21.4s, v3.4s, v13.4s\n"
- "ldr x20, [x13, #0x50]\n"
"fmla v24.4s, v7.4s, v13.4s\n"
"fmla v23.4s, v6.4s, v13.4s\n"
- "ldr x22, [x13, #0x58]\n"
- "ldr x21, [x13, #0x60]\n"
"fmla v22.4s, v4.4s, v13.4s\n"
"fmla v21.4s, v8.4s, v17.4s\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0x68]\n"
+ "ldr q17, [x22, x13]\n"
"fmla v24.4s, v1.4s, v16.4s\n"
"fmla v23.4s, v0.4s, v16.4s\n"
- "ldr q16, [x22, x28]\n"
- "ldr x26, [x13, #0x70]\n"
+ "ldr q16, [x21, x13]\n"
+ "ldp x22, x21, [x14, #0x10]\n"
"fmla v22.4s, v5.4s, v20.4s\n"
"fmla v21.4s, v4.4s, v20.4s\n"
- "ldr q4, [x14, #0x50]\n"
- "ldr x25, [x13, #0x78]\n"
+ "ldr q4, [x15, #0x50]\n"
"fmla v24.4s, v2.4s, v18.4s\n"
"fmla v23.4s, v1.4s, v18.4s\n"
- "ldr q19, [x21, x28]\n"
- "ldr q1, [x14, #0x20]\n"
+ "ldr q19, [x20, x13]\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr x20, [x14, #0x20]\n"
"fmla v22.4s, v0.4s, v17.4s\n"
- "ldr q0, [x14, #0x10]\n"
+ "ldr q0, [x15, #0x10]\n"
"fmla v21.4s, v2.4s, v16.4s\n"
- "ldr q2, [x14, #0x30]\n"
+ "ldr q2, [x15, #0x30]\n"
"fmla v24.4s, v8.4s, v20.4s\n"
+ "ldr q13, [x20, x17]\n"
"fmla v23.4s, v7.4s, v20.4s\n"
- "ldr q18, [x20, x28]\n"
- "ldp x24, x23, [x13, #0x0]\n"
+ "ldr q18, [x27, x13]\n"
"fmla v22.4s, v3.4s, v19.4s\n"
"fmla v21.4s, v5.4s, v18.4s\n"
- "ldp x22, x21, [x13, #0x10]\n"
- "ldr x20, [x13, #0x20]\n"
- "ldr q13, [x20, x16]\n"
"fmla v24.4s, v3.4s, v17.4s\n"
- "ldr q17, [x26, x28]\n"
+ "ldr q17, [x26, x13]\n"
+ "ldr q3, [x15, #0x40]\n"
"fmla v23.4s, v5.4s, v16.4s\n"
- "ldr q16, [x25, x28]\n"
- "ldr q3, [x14, #0x40]\n"
+ "ldr q16, [x25, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "add x13, x13, #0x10\n"
"fmla v22.4s, v7.4s, v17.4s\n"
"fmla v21.4s, v6.4s, v17.4s\n"
- "ldr q11, [x22, x16]\n"
- "ldr q5, [x14, #0x60]\n"
+ "ldr q11, [x22, x17]\n"
"fmla v24.4s, v6.4s, v19.4s\n"
+ "ldr q9, [x24, x17]\n"
+ "ldr q6, [x15, #0x70]\n"
"fmla v23.4s, v8.4s, v18.4s\n"
- "ldr q9, [x24, x16]\n"
- "ldr q10, [x23, x16]\n"
+ "ldr q10, [x23, x17]\n"
"fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x15, #0x90]\n"
"fmla v21.4s, v7.4s, v16.4s\n"
- "ldr q12, [x21, x16]\n"
- "ldr q6, [x14, #0x70]\n"
+ "ldr q12, [x21, x17]\n"
+ "add x17, x17, #0x10\n"
+ "ldr q7, [x15, #0x80]\n"
+ "cmp x17, x16, LSL #4\n"
+ "add x15, x15, #0xa0\n"
"fmax v24.4s, v24.4s, v27.4s\n"
"fmax v23.4s, v23.4s, v27.4s\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"fmax v21.4s, v21.4s, v27.4s\n"
- "add x16, x16, #0x10\n"
- "add x27, x27, #0x10\n"
"fmin v24.4s, v24.4s, v26.4s\n"
"fmin v23.4s, v23.4s, v26.4s\n"
- "cmp x16, x15, LSL #4\n"
"fmin v22.4s, v22.4s, v26.4s\n"
"fmin v21.4s, v21.4s, v26.4s\n"
- "add x28, x28, #0x10\n"
- "str q24, [x12, x27]\n"
- "add x14, x14, #0xa0\n"
- "str q23, [x11, x27]\n"
- "str q22, [x10, x27]\n"
- "str q21, [x9, x27]\n"
+ "str q24, [x12, x28]\n"
+ "str q23, [x11, x28]\n"
+ "str q22, [x10, x28]\n"
+ "str q21, [x9, x28]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
"mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
- "ldr x21, [x13, #0x28]\n"
- "ldr x20, [x13, #0x30]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "ldr x21, [x14, #0x30]\n"
"mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
"mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "ldr q18, [x21, x28]\n"
- "ldr x21, [x13, #0x38]\n"
+ "ldr x27, [x14, #0x38]\n"
+ "ldr x20, [x14, #0x48]\n"
+ "ldr x26, [x14, #0x40]\n"
+ "ldr x25, [x14, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q18, [x22, x13]\n"
+ "ldr x24, [x14, #0x58]\n"
"fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x20, x13]\n"
"fmla v23.4s, v2.4s, v11.4s\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0x48]\n"
- "ldr q20, [x20, x28]\n"
+ "ldr q17, [x21, x13]\n"
"fmla v22.4s, v2.4s, v12.4s\n"
"fmla v21.4s, v1.4s, v12.4s\n"
- "ldr x20, [x13, #0x40]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "ldr x21, [x14, #0x70]\n"
"fmla v24.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x21, [x13, #0x50]\n"
+ "ldr q16, [x27, x13]\n"
+ "ldr x20, [x14, #0x78]\n"
"fmla v22.4s, v6.4s, v18.4s\n"
- "ldr q18, [x20, x28]\n"
+ "ldr q18, [x26, x13]\n"
"fmla v21.4s, v3.4s, v13.4s\n"
- "ldr x20, [x13, #0x58]\n"
"fmla v24.4s, v7.4s, v13.4s\n"
"fmla v23.4s, v6.4s, v13.4s\n"
- "ldr x23, [x13, #0x60]\n"
- "ldr x22, [x13, #0x68]\n"
"fmla v22.4s, v4.4s, v13.4s\n"
"fmla v21.4s, v8.4s, v17.4s\n"
- "ldr q17, [x21, x28]\n"
- "ldr x21, [x13, #0x70]\n"
+ "ldr q17, [x25, x13]\n"
"fmla v24.4s, v1.4s, v16.4s\n"
"fmla v23.4s, v0.4s, v16.4s\n"
- "ldr q16, [x20, x28]\n"
- "ldr x20, [x13, #0x78]\n"
+ "ldr q16, [x24, x13]\n"
"fmla v22.4s, v5.4s, v20.4s\n"
"fmla v21.4s, v4.4s, v20.4s\n"
- "add x27, x27, #0x10\n"
"fmla v24.4s, v2.4s, v18.4s\n"
"fmla v23.4s, v1.4s, v18.4s\n"
- "ldr q19, [x23, x28]\n"
+ "ldr q19, [x23, x13]\n"
"fmla v22.4s, v0.4s, v17.4s\n"
"fmla v21.4s, v2.4s, v16.4s\n"
"fmla v24.4s, v8.4s, v20.4s\n"
"fmla v23.4s, v7.4s, v20.4s\n"
- "ldr q18, [x22, x28]\n"
+ "ldr q18, [x22, x13]\n"
"fmla v22.4s, v3.4s, v19.4s\n"
"fmla v21.4s, v5.4s, v18.4s\n"
"fmla v24.4s, v3.4s, v17.4s\n"
- "ldr q17, [x21, x28]\n"
+ "ldr q17, [x21, x13]\n"
"fmla v23.4s, v5.4s, v16.4s\n"
- "ldr q16, [x20, x28]\n"
+ "ldr q16, [x20, x13]\n"
+ "add x13, x13, #0x10\n"
"fmla v22.4s, v7.4s, v17.4s\n"
"fmla v21.4s, v6.4s, v17.4s\n"
- "add x28, x28, #0x10\n"
"fmla v24.4s, v6.4s, v19.4s\n"
"fmla v23.4s, v8.4s, v18.4s\n"
- "fmax v24.4s, v24.4s, v27.4s\n"
"fmla v22.4s, v8.4s, v16.4s\n"
"fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
"fmax v23.4s, v23.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"fmax v21.4s, v21.4s, v27.4s\n"
- "fmin v24.4s, v24.4s, v26.4s\n"
"fmin v23.4s, v23.4s, v26.4s\n"
- "str q24, [x12, x27]\n"
+ "str q24, [x12, x28]\n"
"fmin v22.4s, v22.4s, v26.4s\n"
"fmin v21.4s, v21.4s, v26.4s\n"
- "str q23, [x11, x27]\n"
- "str q22, [x10, x27]\n"
- "str q21, [x9, x27]\n"
+ "str q23, [x11, x28]\n"
+ "str q22, [x10, x28]\n"
+ "str q21, [x9, x28]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 30f\n"
- "ldr q25, [x14, #0x0]\n"
- "ldr q0, [x14, #0x10]\n"
- "mov x20, x28\n"
+ "ldr q25, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x20, x13\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
"add x12, x12, x20\n"
- "ldr q1, [x14, #0x20]\n"
- "ldr q2, [x14, #0x30]\n"
"add x11, x11, x20\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
"add x10, x10, x20\n"
- "ldr q3, [x14, #0x40]\n"
- "ldr q4, [x14, #0x50]\n"
"add x9, x9, x20\n"
- "ldr q5, [x14, #0x60]\n"
- "ldr q6, [x14, #0x70]\n"
- "ldr q7, [x14, #0x80]\n"
- "ldr q8, [x14, #0x90]\n"
- "ldr x24, [x13, #0x0]\n"
- "ldr x23, [x13, #0x8]\n"
- "add x24, x24, x28\n"
- "add x23, x23, x28\n"
- "ldr x22, [x13, #0x10]\n"
- "ldr x21, [x13, #0x18]\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "ldr x20, [x13, #0x20]\n"
- "add x20, x20, x28\n"
+ "ldr x24, [x14, #0x0]\n"
+ "ldr x23, [x14, #0x8]\n"
+ "ldr x22, [x14, #0x10]\n"
+ "ldr x21, [x14, #0x18]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
"ld1 { v10.d }[0], [x23], #0x8\n"
@@ -331,16 +331,16 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
"mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
"mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
- "ldr x20, [x13, #0x28]\n"
- "add x20, x20, x28\n"
+ "ldr x20, [x14, #0x28]\n"
"mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
"mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x13\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v12.4s\n"
- "fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
"tbz %x[n_channels], #1, 6f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
@@ -350,12 +350,12 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v9.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (3, 0): Bit 1: End
"fmla v30.4s, v6.4s, v9.4s\n"
- "ldr x20, [x13, #0x30]\n"
+ "ldr x20, [x14, #0x30]\n"
"fmla v28.4s, v7.4s, v13.4s\n"
- "add x20, x20, x28\n"
"fmla v29.4s, v6.4s, v13.4s\n"
- "fmla v30.4s, v4.4s, v13.4s\n"
"fmla v31.4s, v3.4s, v13.4s\n"
+ "add x20, x20, x13\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -364,9 +364,9 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (3, 3): Bit 1: End
- "ldr x20, [x13, #0x38]\n"
+ "ldr x20, [x14, #0x38]\n"
"fmla v31.4s, v8.4s, v11.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 10f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
@@ -375,10 +375,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (0, 1): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (0, 1): Bit 1: End
- "ldr x20, [x13, #0x40]\n"
+ "ldr x20, [x14, #0x40]\n"
"fmla v28.4s, v1.4s, v12.4s\n"
"fmla v29.4s, v0.4s, v12.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -387,10 +387,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"12:" // Oddments: Load input (0, 2): Bit 1: Unset
"ld1 { v9.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (0, 2): Bit 1: End
- "ldr x20, [x13, #0x48]\n"
+ "ldr x20, [x14, #0x48]\n"
"fmla v28.4s, v2.4s, v9.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -399,12 +399,12 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (2, 2): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 2): Bit 1: End
- "ldr x20, [x13, #0x50]\n"
+ "ldr x20, [x14, #0x50]\n"
"fmla v28.4s, v8.4s, v10.4s\n"
"fmla v29.4s, v7.4s, v10.4s\n"
- "add x20, x20, x28\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 16f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
@@ -413,10 +413,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"16:" // Oddments: Load input (1, 0): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (1, 0): Bit 1: End
- "ldr x20, [x13, #0x58]\n"
+ "ldr x20, [x14, #0x58]\n"
"fmla v28.4s, v3.4s, v11.4s\n"
"fmla v30.4s, v0.4s, v11.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
@@ -425,10 +425,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (1, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x20, [x13, #0x60]\n"
+ "ldr x20, [x14, #0x60]\n"
"fmla v29.4s, v5.4s, v12.4s\n"
"fmla v31.4s, v2.4s, v12.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -437,10 +437,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"20:" // Oddments: Load input (2, 0): Bit 1: Unset
"ld1 { v9.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (2, 0): Bit 1: End
- "ldr x20, [x13, #0x68]\n"
+ "ldr x20, [x14, #0x68]\n"
"fmla v28.4s, v6.4s, v9.4s\n"
"fmla v30.4s, v3.4s, v9.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -449,10 +449,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (2, 3): Bit 1: Unset
"ld1 { v10.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (2, 3): Bit 1: End
- "ldr x20, [x13, #0x70]\n"
+ "ldr x20, [x14, #0x70]\n"
"fmla v29.4s, v8.4s, v10.4s\n"
"fmla v31.4s, v5.4s, v10.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -461,10 +461,10 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 1): Bit 1: End
- "ldr x20, [x13, #0x78]\n"
+ "ldr x20, [x14, #0x78]\n"
"fmla v30.4s, v7.4s, v11.4s\n"
"fmla v31.4s, v6.4s, v11.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x13\n"
"tbz %x[n_channels], #1, 26f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
@@ -477,9 +477,9 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v31.4s, v7.4s, v12.4s\n"
"fmax v28.4s, v28.4s, v27.4s\n"
"fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
"fmax v30.4s, v30.4s, v27.4s\n"
"fmax v31.4s, v31.4s, v27.4s\n"
- "fmin v28.4s, v28.4s, v26.4s\n"
"fmin v29.4s, v29.4s, v26.4s\n"
"fmin v30.4s, v30.4s, v26.4s\n"
"fmin v31.4s, v31.4s, v26.4s\n"
@@ -503,7 +503,7 @@ void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"30:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 9bfcd9cd3c..8a7542d3aa 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,52 +87,52 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x24, #0x0\n"
- "mov x23, #0x0\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
"1:" // Tile loop
- "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x28, #0x3\n"
"mov x27, #0x3\n"
- "mov x26, #0x3\n"
- "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
- "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
- "mov x24, #0x10\n" // cntb _, ALL, #1
- "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
- "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x8, x8, #0x2\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
- "lsl x17, x17, #0x2\n"
- "lsr x23, %x[n_channels], #0x2\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
- "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x13, x16, x25, LSL #2\n"
- "mul x20, x20, x26\n" // offset *= output_tile_size
- "add x12, x13, x25, LSL #2\n"
- "add x11, x8, x8\n"
- "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "add x10, x12, x25, LSL #2\n"
- "add x9, x11, x8\n"
- "add x28, x15, x22, LSL #2\n"
+ "str x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsr x17, %x[n_channels], #0x2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v15.4s }, [x20]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x24, #0x0\n"
"ld1r { v14.4s }, [x20]\n"
- "add x27, x10, x25, LSL #2\n"
- "add x26, x9, x8\n"
- "add x25, x28, x22, LSL #2\n"
- "add x22, x17, x17\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x24\n"
- "cbz x23, 4f\n"
+ "mul x23, x10, x26\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x22, XZR, x6\n"
+ "mul x21, x10, x25\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x23, x9, x7, x23\n" // offset += tile_j * ld_input_col
+ "lsl x7, x7, #0x2\n"
+ "madd x21, x9, x8, x21\n" // offset += tile_j * ld_output_col
+ "lsl x8, x8, #0x2\n"
+ "mul x23, x23, x28\n" // offset *= kernel_stride * output_size
+ "add x13, x7, x7\n"
+ "add x12, x13, x7\n"
+ "add x11, x12, x7\n"
+ "mul x21, x21, x27\n" // offset *= output_tile_size
+ "add x20, x8, x8\n"
+ "add x16, x16, x23, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x10, x16, x26, LSL #2\n"
+ "add x9, x10, x26, LSL #2\n"
+ "add x15, x15, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x28, x9, x26, LSL #2\n"
+ "add x27, x15, x25, LSL #2\n"
+ "add x26, x28, x26, LSL #2\n"
+ "add x25, x27, x25, LSL #2\n"
+ "cbz x17, 4f\n"
"ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "cmp x24, x23, LSL #4\n"
+ "cmp x6, x17, LSL #4\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
"ldr q3, [x14, #0x40]\n"
@@ -142,321 +142,321 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr q7, [x14, #0x80]\n"
"ldr q8, [x14, #0x90]\n"
"add x14, x14, #0xa0\n"
- "ldr q9, [x12, x11]\n"
+ "ldr q9, [x9, x13]\n"
"ld1 { v10.4s }, [x16]\n"
- "ldr q11, [x16, x26]\n"
- "ld1 { v12.4s }, [x27]\n"
- "ldr q13, [x13, x11]\n"
+ "ldr q11, [x16, x11]\n"
+ "ld1 { v12.4s }, [x26]\n"
+ "ldr q13, [x10, x13]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
- "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "add x6, x6, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "cmp x6, x17, LSL #4\n"
"add x24, x24, #0x10\n"
- "cmp x24, x23, LSL #4\n"
- "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
- "add x20, x20, #0x10\n"
- "add x21, x21, #0x10\n"
- "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
- "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q23, [x12, x9]\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q18, [x12, x8]\n"
- "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v6.4s, v18.4s\n"
- "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v27.4s, v3.4s, v13.4s\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v25.4s, v1.4s, v13.4s\n"
- "fmla v24.4s, v0.4s, v13.4s\n"
- "ldr q17, [x16, x8]\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "ldr q16, [x27, x26]\n"
- "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x9, x12]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x9, x7]\n"
+ "fmla v27.4s, v2.4s, v13.4s\n"
+ "fmla v26.4s, v1.4s, v13.4s\n"
+ "fmla v25.4s, v0.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x26, x11]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
"ldr q31, [x14, #0x0]\n"
- "fmla v29.4s, v0.4s, v17.4s\n"
- "fmla v21.4s, v8.4s, v16.4s\n"
- "ldr q16, [x16, x9]\n"
- "fmla v28.4s, v7.4s, v18.4s\n"
- "fmla v20.4s, v0.4s, v18.4s\n"
- "fmla v26.4s, v4.4s, v18.4s\n"
- "fmla v25.4s, v3.4s, v18.4s\n"
- "fmla v22.4s, v1.4s, v18.4s\n"
- "ld1 { v19.4s }, [x13]\n"
- "fmla v29.4s, v2.4s, v16.4s\n"
- "fmla v27.4s, v1.4s, v16.4s\n"
- "ld1 { v18.4s }, [x10]\n"
- "fmla v24.4s, v4.4s, v23.4s\n"
- "fmla v28.4s, v1.4s, v17.4s\n"
- "ldr q16, [x13, x26]\n"
- "fmla v20.4s, v2.4s, v23.4s\n"
- "fmla v21.4s, v1.4s, v23.4s\n"
- "fmla v29.4s, v8.4s, v23.4s\n"
- "fmla v27.4s, v7.4s, v23.4s\n"
- "fmla v25.4s, v5.4s, v23.4s\n"
- "ldr q17, [x10, x11]\n"
+ "fmla v30.4s, v6.4s, v17.4s\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q18, [x16, x7]\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x12]\n"
+ "fmla v26.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "fmla v30.4s, v0.4s, v18.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ld1 { v20.4s }, [x10]\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v26.4s, v5.4s, v22.4s\n"
+ "fmla v21.4s, v2.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v20.4s\n"
+ "fmla v30.4s, v2.4s, v16.4s\n"
+ "ld1 { v17.4s }, [x28]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q16, [x10, x11]\n"
+ "fmla v28.4s, v7.4s, v22.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v6.4s, v17.4s\n"
+ "ldr q19, [x10, x7]\n"
+ "fmla v30.4s, v8.4s, v22.4s\n"
+ "ldr q18, [x28, x13]\n"
+ "fmla v29.4s, v3.4s, v20.4s\n"
+ "ldr q17, [x28, x11]\n"
+ "fmla v28.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x26, x7]\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v27.4s, v8.4s, v18.4s\n"
+ "fmla v30.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v5.4s, v17.4s\n"
"fmla v26.4s, v0.4s, v19.4s\n"
- "fmla v22.4s, v3.4s, v18.4s\n"
- "fmla v24.4s, v2.4s, v16.4s\n"
- "fmla v20.4s, v4.4s, v17.4s\n"
- "fmla v21.4s, v3.4s, v17.4s\n"
- "fmla v28.4s, v3.4s, v19.4s\n"
- "ldr q19, [x10, x26]\n"
- "fmla v27.4s, v5.4s, v16.4s\n"
- "ldr q16, [x27, x8]\n"
- "fmla v26.4s, v6.4s, v18.4s\n"
- "ldr q18, [x13, x8]\n"
- "fmla v25.4s, v7.4s, v17.4s\n"
- "fmla v22.4s, v5.4s, v17.4s\n"
- "fmla v24.4s, v6.4s, v17.4s\n"
- "fmla v21.4s, v5.4s, v19.4s\n"
- "fmla v20.4s, v6.4s, v16.4s\n"
- "fmla v26.4s, v8.4s, v17.4s\n"
- "fmla v22.4s, v7.4s, v16.4s\n"
- "ldr q17, [x27, x9]\n"
- "fmla v29.4s, v3.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v18.4s\n"
- "fmla v24.4s, v8.4s, v19.4s\n"
- "ldr q16, [x13, x9]\n"
- "fmla v20.4s, v8.4s, v17.4s\n"
- "add x13, x13, #0x10\n"
- "fmla v21.4s, v7.4s, v17.4s\n"
- "ldr q19, [x10, x9]\n"
- "fmla v28.4s, v4.4s, v18.4s\n"
- "fmla v26.4s, v1.4s, v18.4s\n"
- "ldr q17, [x10, x8]\n"
- "fmla v29.4s, v5.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "ldr q18, [x26, x12]\n"
+ "fmla v25.4s, v8.4s, v17.4s\n"
+ "ldr q16, [x10, x12]\n"
+ "fmla v27.4s, v1.4s, v19.4s\n"
+ "ldr q17, [x28, x7]\n"
"add x10, x10, #0x10\n"
- "fmla v27.4s, v4.4s, v16.4s\n"
- "fmla v25.4s, v2.4s, v16.4s\n"
- "fmla v24.4s, v1.4s, v16.4s\n"
- "ldr q16, [x16, x11]\n"
- "fmla v22.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v18.4s\n"
+ "fmla v23.4s, v7.4s, v18.4s\n"
+ "ldr q19, [x28, x12]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v16.4s\n"
+ "fmla v26.4s, v2.4s, v16.4s\n"
+ "fmla v25.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x13]\n"
+ "fmla v24.4s, v4.4s, v17.4s\n"
"add x16, x16, #0x10\n"
"ld1 { v10.4s }, [x16]\n"
- "fmla v20.4s, v3.4s, v17.4s\n"
- "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v27.4s, v7.4s, v17.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
"ldr q4, [x14, #0x50]\n"
- "fmla v26.4s, v7.4s, v17.4s\n"
- "fmla v25.4s, v6.4s, v17.4s\n"
- "ld1 { v18.4s }, [x12]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x9]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
"ldr q1, [x14, #0x20]\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmla v27.4s, v0.4s, v16.4s\n"
- "ldr q17, [x12, x26]\n"
- "fmla v24.4s, v7.4s, v19.4s\n"
- "add x12, x12, #0x10\n"
- "ldr q9, [x12, x11]\n"
- "fmla v20.4s, v5.4s, v19.4s\n"
- "fmla v22.4s, v0.4s, v18.4s\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x9, x11]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "add x9, x9, #0x10\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v0.4s, v18.4s\n"
"ldr q0, [x14, #0x10]\n"
- "fmla v21.4s, v2.4s, v17.4s\n"
- "ldr q2, [x14, #0x30]\n"
- "fmla v25.4s, v8.4s, v19.4s\n"
- "ldr q16, [x27, x11]\n"
- "fmla v28.4s, v6.4s, v18.4s\n"
- "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x26, x13]\n"
+ "fmla v27.4s, v3.4s, v18.4s\n"
"ldr q3, [x14, #0x40]\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmla v27.4s, v8.4s, v17.4s\n"
- "fmla v24.4s, v5.4s, v17.4s\n"
- "ldr q11, [x16, x26]\n"
+ "fmla v23.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmla v28.4s, v8.4s, v17.4s\n"
+ "fmla v25.4s, v5.4s, v17.4s\n"
+ "ldr q11, [x16, x11]\n"
"ldr q5, [x14, #0x60]\n"
- "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v16.4s\n"
"ldr q8, [x14, #0x90]\n"
- "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
"ldr q7, [x14, #0x80]\n"
- "fmla v21.4s, v6.4s, v16.4s\n"
- "ldr q13, [x13, x11]\n"
- "ldr q6, [x14, #0x70]\n"
"fmax v27.4s, v27.4s, v15.4s\n"
"fmax v26.4s, v26.4s, v15.4s\n"
+ "add x26, x26, #0x10\n"
+ "ld1 { v12.4s }, [x26]\n"
+ "fmla v23.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x10, x13]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
"fmax v25.4s, v25.4s, v15.4s\n"
- "add x27, x27, #0x10\n"
- "ld1 { v12.4s }, [x27]\n"
- "fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
"add x14, x14, #0xa0\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
"fmax v21.4s, v21.4s, v15.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
"fmin v29.4s, v29.4s, v14.4s\n"
- "st1 { v28.4s }, [x15]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
"fmin v27.4s, v27.4s, v14.4s\n"
"fmin v26.4s, v26.4s, v14.4s\n"
- "str q29, [x15, x17]\n"
"fmin v25.4s, v25.4s, v14.4s\n"
"fmin v24.4s, v24.4s, v14.4s\n"
- "str q27, [x15, x22]\n"
- "add x15, x15, #0x10\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "st1 { v26.4s }, [x28]\n"
+ "st1 { v29.4s }, [x15]\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "str q25, [x28, x17]\n"
- "str q24, [x28, x22]\n"
- "add x28, x28, #0x10\n"
- "st1 { v22.4s }, [x25]\n"
- "str q20, [x25, x17]\n"
- "str q21, [x25, x22]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q30, [x15, x8]\n"
+ "str q28, [x15, x20]\n"
+ "add x15, x15, #0x10\n"
+ "st1 { v27.4s }, [x27]\n"
+ "str q26, [x27, x8]\n"
+ "str q25, [x27, x20]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v24.4s }, [x25]\n"
+ "str q21, [x25, x8]\n"
+ "str q23, [x25, x20]\n"
"add x25, x25, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
- "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
- "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr q23, [x12, x9]\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q18, [x12, x8]\n"
- "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v6.4s, v18.4s\n"
- "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "fmla v28.4s, v5.4s, v13.4s\n"
- "fmla v27.4s, v3.4s, v13.4s\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v25.4s, v1.4s, v13.4s\n"
- "fmla v24.4s, v0.4s, v13.4s\n"
- "ldr q17, [x16, x8]\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
- "ldr q16, [x27, x26]\n"
- "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
- "fmla v29.4s, v0.4s, v17.4s\n"
- "fmla v21.4s, v8.4s, v16.4s\n"
- "ldr q16, [x16, x9]\n"
- "fmla v28.4s, v7.4s, v18.4s\n"
- "fmla v20.4s, v0.4s, v18.4s\n"
- "fmla v26.4s, v4.4s, v18.4s\n"
- "fmla v25.4s, v3.4s, v18.4s\n"
- "fmla v22.4s, v1.4s, v18.4s\n"
- "ld1 { v19.4s }, [x13]\n"
- "fmla v29.4s, v2.4s, v16.4s\n"
- "fmla v27.4s, v1.4s, v16.4s\n"
- "ld1 { v18.4s }, [x10]\n"
- "fmla v24.4s, v4.4s, v23.4s\n"
- "fmla v28.4s, v1.4s, v17.4s\n"
- "ldr q16, [x13, x26]\n"
- "fmla v20.4s, v2.4s, v23.4s\n"
- "fmla v21.4s, v1.4s, v23.4s\n"
- "fmla v29.4s, v8.4s, v23.4s\n"
- "fmla v27.4s, v7.4s, v23.4s\n"
- "fmla v25.4s, v5.4s, v23.4s\n"
- "ldr q17, [x10, x11]\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x9, x12]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x9, x7]\n"
+ "fmla v27.4s, v2.4s, v13.4s\n"
+ "fmla v26.4s, v1.4s, v13.4s\n"
+ "fmla v25.4s, v0.4s, v13.4s\n"
+ "fmla v24.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x26, x11]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v6.4s, v17.4s\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q18, [x16, x7]\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x12]\n"
+ "fmla v26.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "fmla v30.4s, v0.4s, v18.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ld1 { v20.4s }, [x10]\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v26.4s, v5.4s, v22.4s\n"
+ "fmla v21.4s, v2.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v20.4s\n"
+ "fmla v30.4s, v2.4s, v16.4s\n"
+ "ld1 { v17.4s }, [x28]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q16, [x10, x11]\n"
+ "fmla v28.4s, v7.4s, v22.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v6.4s, v17.4s\n"
+ "ldr q19, [x10, x7]\n"
+ "fmla v30.4s, v8.4s, v22.4s\n"
+ "ldr q18, [x28, x13]\n"
+ "fmla v29.4s, v3.4s, v20.4s\n"
+ "ldr q17, [x28, x11]\n"
+ "fmla v28.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x26, x7]\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v27.4s, v8.4s, v18.4s\n"
+ "fmla v30.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v5.4s, v17.4s\n"
"fmla v26.4s, v0.4s, v19.4s\n"
- "fmla v22.4s, v3.4s, v18.4s\n"
- "fmla v24.4s, v2.4s, v16.4s\n"
- "fmla v20.4s, v4.4s, v17.4s\n"
- "fmla v21.4s, v3.4s, v17.4s\n"
- "fmla v28.4s, v3.4s, v19.4s\n"
- "ldr q19, [x10, x26]\n"
- "fmla v27.4s, v5.4s, v16.4s\n"
- "ldr q16, [x27, x8]\n"
- "fmla v26.4s, v6.4s, v18.4s\n"
- "ldr q18, [x13, x8]\n"
- "fmla v25.4s, v7.4s, v17.4s\n"
- "fmla v22.4s, v5.4s, v17.4s\n"
- "fmla v24.4s, v6.4s, v17.4s\n"
- "fmla v21.4s, v5.4s, v19.4s\n"
- "fmla v20.4s, v6.4s, v16.4s\n"
- "fmla v26.4s, v8.4s, v17.4s\n"
- "fmla v22.4s, v7.4s, v16.4s\n"
- "ldr q17, [x27, x9]\n"
- "fmla v29.4s, v3.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v18.4s\n"
- "fmla v24.4s, v8.4s, v19.4s\n"
- "ldr q16, [x13, x9]\n"
- "fmla v20.4s, v8.4s, v17.4s\n"
- "add x13, x13, #0x10\n"
- "fmla v21.4s, v7.4s, v17.4s\n"
- "ldr q19, [x10, x9]\n"
- "fmla v28.4s, v4.4s, v18.4s\n"
- "fmla v26.4s, v1.4s, v18.4s\n"
- "ldr q17, [x10, x8]\n"
- "fmla v29.4s, v5.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "ldr q18, [x26, x12]\n"
+ "fmla v25.4s, v8.4s, v17.4s\n"
+ "ldr q16, [x10, x12]\n"
+ "fmla v27.4s, v1.4s, v19.4s\n"
+ "ldr q17, [x28, x7]\n"
"add x10, x10, #0x10\n"
- "fmla v27.4s, v4.4s, v16.4s\n"
- "fmla v25.4s, v2.4s, v16.4s\n"
- "fmla v24.4s, v1.4s, v16.4s\n"
- "ldr q16, [x16, x11]\n"
- "fmla v22.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v18.4s\n"
+ "fmla v23.4s, v7.4s, v18.4s\n"
+ "ldr q19, [x28, x12]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v16.4s\n"
+ "fmla v26.4s, v2.4s, v16.4s\n"
+ "fmla v25.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x13]\n"
+ "fmla v24.4s, v4.4s, v17.4s\n"
"add x16, x16, #0x10\n"
- "fmla v20.4s, v3.4s, v17.4s\n"
- "fmla v21.4s, v4.4s, v19.4s\n"
- "fmla v26.4s, v7.4s, v17.4s\n"
- "fmla v25.4s, v6.4s, v17.4s\n"
- "ld1 { v18.4s }, [x12]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v1.4s, v16.4s\n"
- "fmax v29.4s, v29.4s, v15.4s\n"
- "fmla v27.4s, v0.4s, v16.4s\n"
- "ldr q17, [x12, x26]\n"
- "fmla v24.4s, v7.4s, v19.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmla v20.4s, v5.4s, v19.4s\n"
- "fmla v22.4s, v0.4s, v18.4s\n"
- "add x12, x12, #0x10\n"
- "fmla v21.4s, v2.4s, v17.4s\n"
- "fmla v25.4s, v8.4s, v19.4s\n"
- "ldr q16, [x27, x11]\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "fmla v28.4s, v6.4s, v18.4s\n"
- "fmla v26.4s, v3.4s, v18.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "add x27, x27, #0x10\n"
- "fmla v27.4s, v8.4s, v17.4s\n"
- "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v27.4s, v7.4s, v17.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "fmla v26.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x9]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x9, x11]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x26, x13]\n"
+ "fmla v27.4s, v3.4s, v18.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v23.4s, v2.4s, v17.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmla v28.4s, v8.4s, v17.4s\n"
+ "fmla v25.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
"fmax v27.4s, v27.4s, v15.4s\n"
- "fmla v22.4s, v8.4s, v16.4s\n"
- "fmla v20.4s, v7.4s, v16.4s\n"
"fmax v26.4s, v26.4s, v15.4s\n"
- "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmla v23.4s, v6.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
"fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "fmax v20.4s, v20.4s, v15.4s\n"
"fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
"fmin v28.4s, v28.4s, v14.4s\n"
- "st1 { v28.4s }, [x15]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
"fmin v26.4s, v26.4s, v14.4s\n"
- "str q29, [x15, x17]\n"
+ "st1 { v27.4s }, [x27]\n"
"fmin v25.4s, v25.4s, v14.4s\n"
"fmin v24.4s, v24.4s, v14.4s\n"
- "str q27, [x15, x22]\n"
- "add x15, x15, #0x10\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "st1 { v26.4s }, [x28]\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "str q25, [x28, x17]\n"
- "str q24, [x28, x22]\n"
- "add x28, x28, #0x10\n"
- "st1 { v22.4s }, [x25]\n"
- "str q20, [x25, x17]\n"
- "str q21, [x25, x22]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "st1 { v29.4s }, [x15]\n"
+ "str q30, [x15, x8]\n"
+ "str q28, [x15, x20]\n"
+ "add x15, x15, #0x10\n"
+ "str q26, [x27, x8]\n"
+ "str q25, [x27, x20]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v24.4s }, [x25]\n"
+ "str q21, [x25, x8]\n"
+ "str q23, [x25, x20]\n"
"add x25, x25, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 49f\n"
"ldr q31, [x14, #0x0]\n"
"ldr q0, [x14, #0x10]\n"
- "add x24, x12, x11\n"
+ "add x24, x9, x13\n"
"add x23, x16, XZR\n"
"ldr q1, [x14, #0x20]\n"
"ldr q2, [x14, #0x30]\n"
- "add x22, x16, x26\n"
- "add x21, x27, XZR\n"
+ "add x22, x16, x11\n"
+ "add x21, x26, XZR\n"
"ldr q3, [x14, #0x40]\n"
"ldr q4, [x14, #0x50]\n"
- "add x20, x13, x11\n"
+ "add x20, x10, x13\n"
"ldr q5, [x14, #0x60]\n"
"ldr q6, [x14, #0x70]\n"
"ldr q7, [x14, #0x80]\n"
@@ -483,23 +483,23 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
"mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
"mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
- "add x20, x27, x26\n"
+ "add x20, x26, x11\n"
"mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
"mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
"mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
"mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
"mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
"fmla v23.4s, v0.4s, v10.4s\n"
"fmla v25.4s, v2.4s, v11.4s\n"
- "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
"fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
"fmla v24.4s, v4.4s, v13.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
"fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
"fmla v28.4s, v0.4s, v13.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
"tbz %x[n_channels], #1, 7f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
@@ -509,7 +509,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"ldr s12, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
"fmla v31.4s, v8.4s, v12.4s\n"
- "add x20, x12, x8\n"
+ "add x20, x9, x7\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
@@ -520,7 +520,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
"fmla v23.4s, v7.4s, v11.4s\n"
"fmla v24.4s, v6.4s, v11.4s\n"
- "add x20, x16, x8\n"
+ "add x20, x16, x7\n"
"fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v11.4s\n"
@@ -535,7 +535,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
"fmla v23.4s, v1.4s, v13.4s\n"
"fmla v24.4s, v0.4s, v13.4s\n"
- "add x20, x16, x9\n"
+ "add x20, x16, x12\n"
"tbz %x[n_channels], #1, 13f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
@@ -546,7 +546,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
"fmla v24.4s, v2.4s, v12.4s\n"
"fmla v25.4s, v1.4s, v12.4s\n"
- "add x20, x12, x9\n"
+ "add x20, x9, x12\n"
"tbz %x[n_channels], #1, 15f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
@@ -557,7 +557,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
"fmla v24.4s, v8.4s, v10.4s\n"
"fmla v25.4s, v7.4s, v10.4s\n"
- "add x20, x13, XZR\n"
+ "add x20, x10, XZR\n"
"fmla v27.4s, v5.4s, v10.4s\n"
"fmla v28.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
@@ -572,7 +572,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
"fmla v23.4s, v3.4s, v11.4s\n"
"fmla v26.4s, v0.4s, v11.4s\n"
- "add x20, x13, x26\n"
+ "add x20, x10, x11\n"
"tbz %x[n_channels], #1, 19f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
@@ -583,7 +583,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
"fmla v25.4s, v5.4s, v13.4s\n"
"fmla v28.4s, v2.4s, v13.4s\n"
- "add x20, x10, XZR\n"
+ "add x20, x28, XZR\n"
"tbz %x[n_channels], #1, 21f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
@@ -594,7 +594,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
"fmla v26.4s, v6.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
- "add x20, x10, x11\n"
+ "add x20, x28, x13\n"
"tbz %x[n_channels], #1, 23f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
@@ -605,7 +605,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "add x20, x10, x26\n"
+ "add x20, x28, x11\n"
"fmla v28.4s, v6.4s, v10.4s\n"
"fmla v29.4s, v5.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v10.4s\n"
@@ -620,7 +620,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
"fmla v28.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v5.4s, v11.4s\n"
- "add x20, x27, x8\n"
+ "add x20, x26, x7\n"
"tbz %x[n_channels], #1, 27f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
@@ -631,7 +631,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
"fmla v29.4s, v7.4s, v13.4s\n"
"fmla v30.4s, v6.4s, v13.4s\n"
- "add x20, x13, x8\n"
+ "add x20, x10, x7\n"
"tbz %x[n_channels], #1, 29f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
@@ -642,7 +642,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v24.4s, v3.4s, v12.4s\n"
- "add x20, x13, x9\n"
+ "add x20, x10, x12\n"
"fmla v26.4s, v1.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 31f\n"
@@ -655,7 +655,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
"fmla v24.4s, v5.4s, v11.4s\n"
"fmla v25.4s, v4.4s, v11.4s\n"
- "add x20, x27, x9\n"
+ "add x20, x26, x12\n"
"fmla v27.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 33f\n"
@@ -668,7 +668,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
"fmla v30.4s, v8.4s, v13.4s\n"
"fmla v31.4s, v7.4s, v13.4s\n"
- "add x20, x10, x8\n"
+ "add x20, x28, x7\n"
"tbz %x[n_channels], #1, 35f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
@@ -679,7 +679,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
"fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "add x20, x16, x11\n"
+ "add x20, x16, x13\n"
"fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 37f\n"
@@ -692,7 +692,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
"fmla v23.4s, v2.4s, v11.4s\n"
"fmla v24.4s, v1.4s, v11.4s\n"
- "add x20, x10, x9\n"
+ "add x20, x28, x12\n"
"fmla v25.4s, v0.4s, v11.4s\n"
"tbz %x[n_channels], #1, 39f\n"
"ldr d13, [x20], #0x8\n"
@@ -704,7 +704,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
"fmla v27.4s, v8.4s, v13.4s\n"
"fmla v28.4s, v7.4s, v13.4s\n"
- "add x20, x12, XZR\n"
+ "add x20, x9, XZR\n"
"fmla v30.4s, v5.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v13.4s\n"
"tbz %x[n_channels], #1, 41f\n"
@@ -717,7 +717,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
"fmla v23.4s, v6.4s, v12.4s\n"
"fmla v26.4s, v3.4s, v12.4s\n"
- "add x20, x12, x26\n"
+ "add x20, x9, x11\n"
"fmla v29.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 43f\n"
"ldr d11, [x20], #0x8\n"
@@ -729,7 +729,7 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
- "add x20, x27, x11\n"
+ "add x20, x26, x13\n"
"fmla v31.4s, v2.4s, v11.4s\n"
"tbz %x[n_channels], #1, 45f\n"
"ldr d13, [x20], #0x8\n"
@@ -762,63 +762,63 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 47f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.d }[0], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.d }[0], [x21], x17\n"
"add x15, x15, #0x8\n"
- "st1 { v29.d }[0], [x20], x17\n"
- "add x28, x28, #0x8\n"
+ "add x27, x27, #0x8\n"
"add x25, x25, #0x8\n"
- "st1 { v24.d }[0], [x22], x17\n"
- "st1 { v27.d }[0], [x21], x17\n"
- "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v23.d }[0], [x22], x8\n"
+ "st1 { v26.d }[0], [x21], x8\n"
+ "st1 { v29.d }[0], [x20], x8\n"
+ "st1 { v24.d }[0], [x22], x8\n"
+ "st1 { v27.d }[0], [x21], x8\n"
+ "st1 { v30.d }[0], [x20], x8\n"
"st1 { v25.d }[0], [x22]\n"
"st1 { v28.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 48f\n"
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.s }[2], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.s }[2], [x21], x17\n"
- "st1 { v29.s }[2], [x20], x17\n"
- "st1 { v24.s }[2], [x22], x17\n"
- "st1 { v27.s }[2], [x21], x17\n"
- "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v23.s }[2], [x22], x8\n"
+ "st1 { v24.s }[2], [x22], x8\n"
+ "st1 { v26.s }[2], [x21], x8\n"
+ "st1 { v29.s }[2], [x20], x8\n"
+ "st1 { v27.s }[2], [x21], x8\n"
+ "st1 { v30.s }[2], [x20], x8\n"
"st1 { v25.s }[2], [x22]\n"
"st1 { v28.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"b 48f\n"
"47:" // Tile loop: Oddments: Store: Bit 1: Unset
"mov x22, x15\n"
- "mov x21, x28\n"
- "st1 { v23.s }[0], [x22], x17\n"
+ "mov x21, x27\n"
"mov x20, x25\n"
- "st1 { v26.s }[0], [x21], x17\n"
- "st1 { v29.s }[0], [x20], x17\n"
- "st1 { v24.s }[0], [x22], x17\n"
- "st1 { v27.s }[0], [x21], x17\n"
- "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v23.s }[0], [x22], x8\n"
+ "st1 { v24.s }[0], [x22], x8\n"
+ "st1 { v26.s }[0], [x21], x8\n"
+ "st1 { v29.s }[0], [x20], x8\n"
+ "st1 { v27.s }[0], [x21], x8\n"
+ "st1 { v30.s }[0], [x20], x8\n"
"st1 { v25.s }[0], [x22]\n"
"st1 { v28.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"48:" // Tile loop: Oddments: Store: Bit 1: End
"49:" // Tile loop: End
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x23, x23, #0x1\n"
- "add x21, x24, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x23, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x24, x24, x21, LT\n"
- "csel x23, x23, XZR, LT\n"
- "cmp x24, x20\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x9, x9, #0x1\n"
+ "add x20, x10, #0x1\n"
+ "cmp x9, x22\n"
+ "csel x10, x10, x20, LT\n"
+ "csel x9, x9, XZR, LT\n"
+ "cmp x10, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 972f7eb535..5efd35135b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,9 +91,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"lsr x8, %x[n_channels], #0x2\n"
"ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x21]\n"
"ld1r { v14.4s }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"mov x14, #0x0\n"
@@ -111,357 +111,357 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr q7, [x16, #0x80]\n"
"ldr q8, [x16, #0x90]\n"
"add x16, x16, #0xa0\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "ldr q9, [x21, x14]\n"
- "ldr q10, [x20, x14]\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q11, [x21, x14]\n"
- "ldr q12, [x20, x14]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
"ldr x20, [x15, #0x20]\n"
+ "ldr q9, [x24, x14]\n"
+ "ldr q10, [x23, x14]\n"
+ "ldr q11, [x22, x14]\n"
+ "ldr q12, [x21, x14]\n"
"ldr q13, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
- "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
- "ldr x26, [x15, #0x30]\n"
- "ldr x23, [x15, #0x38]\n"
- "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
- "ldr x22, [x15, #0x28]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr q19, [x20, x14]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
- "ldr x21, [x15, #0x40]\n"
- "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
- "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "ldr x26, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "ldr x20, [x15, #0x40]\n"
"ldr x25, [x15, #0x50]\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v0.4s, v9.4s\n"
"ldr x24, [x15, #0x58]\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q17, [x26, x14]\n"
- "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "ldr x20, [x15, #0x60]\n"
- "fmla v29.4s, v5.4s, v13.4s\n"
- "fmla v28.4s, v6.4s, v17.4s\n"
- "ldr x12, [x15, #0x70]\n"
- "ldr x11, [x15, #0x88]\n"
- "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
- "fmla v27.4s, v3.4s, v13.4s\n"
- "ldr x10, [x17, #0x0]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x21, x14]\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "ldr x12, [x15, #0x88]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x22, x14]\n"
+ "fmla v27.4s, v2.4s, v13.4s\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v26.4s, v1.4s, v13.4s\n"
+ "fmla v25.4s, v0.4s, v13.4s\n"
+ "ldr x11, [x17, #0x0]\n"
"add x13, x13, #0x10\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v25.4s, v1.4s, v13.4s\n"
- "ldr x9, [x17, #0x8]\n"
- "ldr x28, [x17, #0x10]\n"
- "fmla v24.4s, v0.4s, v13.4s\n"
- "ldr q18, [x23, x14]\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "ldr q16, [x22, x14]\n"
+ "fmla v24.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x26, x14]\n"
"mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
"ldr q31, [x16, #0x0]\n"
- "fmla v29.4s, v7.4s, v17.4s\n"
- "ldr x23, [x15, #0x68]\n"
- "fmla v28.4s, v0.4s, v18.4s\n"
- "fmla v22.4s, v8.4s, v16.4s\n"
- "ldr q16, [x21, x14]\n"
- "ldr x22, [x15, #0x78]\n"
- "fmla v26.4s, v4.4s, v17.4s\n"
- "fmla v25.4s, v3.4s, v17.4s\n"
- "ldr x21, [x15, #0x80]\n"
- "ldr x27, [x17, #0x18]\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v17.4s\n"
+ "ldr x21, [x15, #0x68]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q18, [x27, x14]\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "ldr x9, [x15, #0x78]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v26.4s, v3.4s, v17.4s\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v21.4s, v0.4s, v17.4s\n"
- "fmla v24.4s, v4.4s, v19.4s\n"
- "fmla v23.4s, v1.4s, v17.4s\n"
+ "fmla v25.4s, v4.4s, v22.4s\n"
+ "ldr x28, [x17, #0x10]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v30.4s, v7.4s, v17.4s\n"
+ "fmla v29.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
"ldr q17, [x25, x14]\n"
- "fmla v29.4s, v1.4s, v18.4s\n"
- "ldr q20, [x24, x14]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v27.4s, v1.4s, v16.4s\n"
- "ldr q16, [x20, x14]\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
"ldr x26, [x15, #0x90]\n"
- "fmla v25.4s, v5.4s, v19.4s\n"
- "fmla v21.4s, v2.4s, v19.4s\n"
- "ldr x25, [x15, #0xa0]\n"
- "ldr x20, [x15, #0x98]\n"
- "fmla v26.4s, v0.4s, v17.4s\n"
- "fmla v24.4s, v2.4s, v20.4s\n"
- "fmla v28.4s, v8.4s, v19.4s\n"
- "fmla v27.4s, v7.4s, v19.4s\n"
- "fmla v22.4s, v1.4s, v19.4s\n"
- "ldr q19, [x23, x14]\n"
- "fmla v23.4s, v3.4s, v16.4s\n"
- "ldr x24, [x15, #0xa8]\n"
- "fmla v26.4s, v6.4s, v16.4s\n"
- "ldr q18, [x21, x14]\n"
- "fmla v25.4s, v7.4s, v19.4s\n"
- "ldr x23, [x15, #0xc0]\n"
- "fmla v24.4s, v6.4s, v19.4s\n"
- "fmla v21.4s, v4.4s, v19.4s\n"
- "fmla v29.4s, v3.4s, v17.4s\n"
- "ldr q17, [x12, x14]\n"
- "fmla v27.4s, v5.4s, v20.4s\n"
- "ldr q16, [x22, x14]\n"
- "fmla v23.4s, v5.4s, v19.4s\n"
- "fmla v22.4s, v3.4s, v19.4s\n"
- "ldr x22, [x15, #0xb0]\n"
- "ldr x21, [x15, #0xb8]\n"
- "fmla v26.4s, v8.4s, v19.4s\n"
- "fmla v24.4s, v8.4s, v17.4s\n"
- "fmla v21.4s, v6.4s, v16.4s\n"
- "fmla v28.4s, v3.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v18.4s\n"
- "fmla v22.4s, v5.4s, v17.4s\n"
- "ldr q17, [x11, x14]\n"
- "fmla v23.4s, v7.4s, v16.4s\n"
- "ldr q16, [x26, x14]\n"
- "fmla v29.4s, v4.4s, v18.4s\n"
- "fmla v26.4s, v1.4s, v18.4s\n"
+ "fmla v26.4s, v5.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v2.4s, v22.4s\n"
+ "fmla v30.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v28.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v24.4s, v3.4s, v16.4s\n"
+ "fmla v30.4s, v3.4s, v17.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "ldr x23, [x15, #0xb0]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
"ldr q18, [x20, x14]\n"
- "fmla v28.4s, v5.4s, v17.4s\n"
- "fmla v27.4s, v4.4s, v17.4s\n"
- "fmla v25.4s, v2.4s, v17.4s\n"
- "fmla v24.4s, v1.4s, v17.4s\n"
- "ldr q17, [x25, x14]\n"
- "fmla v21.4s, v8.4s, v16.4s\n"
- "ldr x20, [x15, #0x20]\n"
- "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x9, x14]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmla v23.4s, v3.4s, v17.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v30.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "fmla v26.4s, v0.4s, v18.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "ldr q16, [x12, x14]\n"
+ "fmla v27.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x25, x14]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v16.4s\n"
+ "fmla v26.4s, v2.4s, v16.4s\n"
+ "fmla v25.4s, v1.4s, v16.4s\n"
"ldr q16, [x24, x14]\n"
- "fmla v29.4s, v2.4s, v17.4s\n"
- "fmla v26.4s, v7.4s, v18.4s\n"
- "fmla v25.4s, v6.4s, v18.4s\n"
- "fmla v23.4s, v4.4s, v18.4s\n"
- "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr x24, [x15, #0x20]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v7.4s, v17.4s\n"
"ldr q18, [x22, x14]\n"
- "fmla v22.4s, v4.4s, v16.4s\n"
- "ldr q4, [x16, #0x50]\n"
- "fmla v28.4s, v1.4s, v17.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v27.4s, v0.4s, v17.4s\n"
- "ldr q17, [x21, x14]\n"
- "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v6.4s, v19.4s\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v21.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v25.4s, v7.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "ldr q4, [x16, #0x50]\n"
"fmax v29.4s, v29.4s, v15.4s\n"
- "fmla v24.4s, v7.4s, v16.4s\n"
- "fmla v21.4s, v5.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q29, [x10, x13]\n"
- "fmla v23.4s, v0.4s, v18.4s\n"
+ "fmla v30.4s, v6.4s, v16.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
"ldr q0, [x16, #0x10]\n"
- "fmla v22.4s, v2.4s, v17.4s\n"
- "ldr q2, [x16, #0x30]\n"
- "fmla v25.4s, v8.4s, v16.4s\n"
- "ldr q16, [x23, x14]\n"
- "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmla v26.4s, v8.4s, v18.4s\n"
+ "fmla v27.4s, v3.4s, v16.4s\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v27.4s, v8.4s, v17.4s\n"
- "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v25.4s, v5.4s, v17.4s\n"
"ldr q5, [x16, #0x60]\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmla v23.4s, v8.4s, v16.4s\n"
+ "fmla v23.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v8.4s, v16.4s\n"
"ldr q8, [x16, #0x90]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "ldr q9, [x23, x7]\n"
+ "ldr q10, [x22, x7]\n"
"fmla v21.4s, v7.4s, v16.4s\n"
"ldr q7, [x16, #0x80]\n"
- "fmla v22.4s, v6.4s, v16.4s\n"
- "ldr q13, [x20, x7]\n"
- "ldr q6, [x16, #0x70]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "ldr q11, [x21, x7]\n"
+ "ldr q12, [x20, x7]\n"
+ "fmla v23.4s, v6.4s, v16.4s\n"
"fmax v27.4s, v27.4s, v15.4s\n"
- "fmax v26.4s, v26.4s, v15.4s\n"
- "fmax v25.4s, v25.4s, v15.4s\n"
- "ldr x24, [x17, #0x20]\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "ldr q9, [x21, x7]\n"
- "ldr q10, [x20, x7]\n"
+ "ldr q13, [x24, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
"fmin v28.4s, v28.4s, v14.4s\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q11, [x21, x7]\n"
"fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "ldr q12, [x20, x7]\n"
+ "str q30, [x11, x13]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
"fmax v24.4s, v24.4s, v15.4s\n"
- "fmax v23.4s, v23.4s, v15.4s\n"
- "str q28, [x9, x13]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
"fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "str q27, [x28, x13]\n"
+ "str q29, [x10, x13]\n"
"ldr x23, [x17, #0x28]\n"
- "str q26, [x27, x13]\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q28, [x28, x13]\n"
"ldr x22, [x17, #0x30]\n"
- "ldr x21, [x17, #0x38]\n"
"add x7, x7, #0x10\n"
- "str q25, [x24, x13]\n"
- "ldr x20, [x17, #0x40]\n"
+ "str q26, [x20, x13]\n"
+ "ldr x21, [x17, #0x40]\n"
"cmp x7, x8, LSL #4\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q27, [x27, x13]\n"
+ "ldr x20, [x17, #0x38]\n"
"fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "add x14, x14, #0x10\n"
- "str q24, [x23, x13]\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "str q23, [x22, x13]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
"add x16, x16, #0xa0\n"
- "str q21, [x21, x13]\n"
- "str q22, [x20, x13]\n"
+ "str q25, [x23, x13]\n"
+ "str q24, [x22, x13]\n"
+ "str q21, [x20, x13]\n"
+ "str q23, [x21, x13]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
- "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
- "ldr x23, [x15, #0x30]\n"
- "ldr x22, [x15, #0x38]\n"
- "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x48]\n"
- "ldr q19, [x20, x14]\n"
- "fmla v28.4s, v4.4s, v13.4s\n"
- "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "ldr x26, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v3.4s, v9.4s\n"
"ldr x20, [x15, #0x40]\n"
- "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
- "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
"ldr x25, [x15, #0x50]\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v0.4s, v9.4s\n"
"ldr x24, [x15, #0x58]\n"
- "fmla v27.4s, v2.4s, v11.4s\n"
- "ldr q17, [x23, x14]\n"
- "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
"ldr x23, [x15, #0x60]\n"
- "fmla v29.4s, v5.4s, v13.4s\n"
- "fmla v28.4s, v6.4s, v17.4s\n"
- "ldr x12, [x15, #0x70]\n"
- "ldr x11, [x15, #0x88]\n"
- "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
- "fmla v27.4s, v3.4s, v13.4s\n"
- "ldr x10, [x17, #0x0]\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x21, x14]\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "ldr x12, [x15, #0x88]\n"
+ "fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x22, x14]\n"
+ "fmla v27.4s, v2.4s, v13.4s\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v26.4s, v1.4s, v13.4s\n"
+ "fmla v25.4s, v0.4s, v13.4s\n"
+ "ldr x11, [x17, #0x0]\n"
"add x13, x13, #0x10\n"
- "fmla v26.4s, v2.4s, v13.4s\n"
- "fmla v25.4s, v1.4s, v13.4s\n"
- "ldr x9, [x17, #0x8]\n"
- "ldr x28, [x17, #0x10]\n"
- "fmla v24.4s, v0.4s, v13.4s\n"
- "ldr q18, [x22, x14]\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "ldr q16, [x21, x14]\n"
+ "fmla v24.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x26, x14]\n"
"mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
- "fmla v29.4s, v7.4s, v17.4s\n"
- "ldr x22, [x15, #0x68]\n"
- "ldr x21, [x15, #0x78]\n"
- "fmla v28.4s, v0.4s, v18.4s\n"
- "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v17.4s\n"
+ "ldr x10, [x17, #0x8]\n"
+ "ldr x9, [x17, #0x10]\n"
+ "fmla v28.4s, v3.4s, v13.4s\n"
+ "ldr q18, [x27, x14]\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "ldr x28, [x15, #0x78]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
"ldr q16, [x20, x14]\n"
+ "fmla v26.4s, v3.4s, v17.4s\n"
"ldr x20, [x15, #0x80]\n"
- "fmla v26.4s, v4.4s, v17.4s\n"
- "fmla v25.4s, v3.4s, v17.4s\n"
- "ldr x27, [x17, #0x18]\n"
"fmla v21.4s, v0.4s, v17.4s\n"
- "fmla v24.4s, v4.4s, v19.4s\n"
- "fmla v23.4s, v1.4s, v17.4s\n"
+ "fmla v25.4s, v4.4s, v22.4s\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v30.4s, v7.4s, v17.4s\n"
+ "fmla v29.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
"ldr q17, [x25, x14]\n"
- "fmla v29.4s, v1.4s, v18.4s\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v26.4s, v5.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v2.4s, v22.4s\n"
+ "fmla v30.4s, v1.4s, v18.4s\n"
"ldr q20, [x24, x14]\n"
- "fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr x25, [x15, #0x98]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
"ldr q16, [x23, x14]\n"
- "ldr x26, [x15, #0x90]\n"
- "fmla v25.4s, v5.4s, v19.4s\n"
- "fmla v21.4s, v2.4s, v19.4s\n"
- "ldr x25, [x15, #0xa0]\n"
- "ldr x24, [x15, #0x98]\n"
- "fmla v26.4s, v0.4s, v17.4s\n"
- "fmla v24.4s, v2.4s, v20.4s\n"
- "fmla v28.4s, v8.4s, v19.4s\n"
- "fmla v27.4s, v7.4s, v19.4s\n"
- "fmla v22.4s, v1.4s, v19.4s\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v28.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v24.4s, v3.4s, v16.4s\n"
+ "fmla v30.4s, v3.4s, v17.4s\n"
"ldr q19, [x22, x14]\n"
- "fmla v23.4s, v3.4s, v16.4s\n"
- "ldr x23, [x15, #0xa8]\n"
- "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr x23, [x15, #0xb0]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "ldr x22, [x15, #0xa8]\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
"ldr q18, [x20, x14]\n"
- "fmla v25.4s, v7.4s, v19.4s\n"
- "ldr x22, [x15, #0xc0]\n"
- "fmla v24.4s, v6.4s, v19.4s\n"
- "fmla v21.4s, v4.4s, v19.4s\n"
- "fmla v29.4s, v3.4s, v17.4s\n"
- "ldr q17, [x12, x14]\n"
- "fmla v27.4s, v5.4s, v20.4s\n"
- "ldr q16, [x21, x14]\n"
- "fmla v23.4s, v5.4s, v19.4s\n"
- "fmla v22.4s, v3.4s, v19.4s\n"
- "ldr x21, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x28, x14]\n"
"ldr x20, [x15, #0xb8]\n"
- "fmla v26.4s, v8.4s, v19.4s\n"
- "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmla v23.4s, v3.4s, v17.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v30.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "fmla v26.4s, v0.4s, v18.4s\n"
"fmla v21.4s, v6.4s, v16.4s\n"
- "fmla v28.4s, v3.4s, v18.4s\n"
- "fmla v25.4s, v0.4s, v18.4s\n"
- "fmla v22.4s, v5.4s, v17.4s\n"
- "ldr q17, [x11, x14]\n"
- "fmla v23.4s, v7.4s, v16.4s\n"
- "ldr q16, [x26, x14]\n"
- "fmla v29.4s, v4.4s, v18.4s\n"
- "fmla v26.4s, v1.4s, v18.4s\n"
- "ldr q18, [x24, x14]\n"
- "fmla v28.4s, v5.4s, v17.4s\n"
- "fmla v27.4s, v4.4s, v17.4s\n"
- "fmla v25.4s, v2.4s, v17.4s\n"
- "fmla v24.4s, v1.4s, v17.4s\n"
- "ldr q17, [x25, x14]\n"
- "fmla v21.4s, v8.4s, v16.4s\n"
- "fmla v22.4s, v7.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "ldr q16, [x12, x14]\n"
+ "fmla v27.4s, v1.4s, v18.4s\n"
+ "ldr q19, [x25, x14]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v16.4s\n"
+ "fmla v26.4s, v2.4s, v16.4s\n"
+ "fmla v25.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v7.4s, v17.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v19.4s\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v21.4s, v3.4s, v19.4s\n"
"ldr q16, [x23, x14]\n"
- "fmla v29.4s, v2.4s, v17.4s\n"
- "fmla v26.4s, v7.4s, v18.4s\n"
- "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v25.4s, v7.4s, v18.4s\n"
"fmla v23.4s, v4.4s, v18.4s\n"
- "fmla v21.4s, v3.4s, v18.4s\n"
- "ldr q18, [x21, x14]\n"
- "fmla v22.4s, v4.4s, v16.4s\n"
- "fmla v28.4s, v1.4s, v17.4s\n"
- "fmax v28.4s, v28.4s, v15.4s\n"
- "fmla v27.4s, v0.4s, v17.4s\n"
- "ldr q17, [x20, x14]\n"
- "fmla v29.4s, v6.4s, v18.4s\n"
"fmax v29.4s, v29.4s, v15.4s\n"
- "fmla v24.4s, v7.4s, v16.4s\n"
- "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmla v30.4s, v6.4s, v16.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v18.4s\n"
+ "fmla v27.4s, v3.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v25.4s, v5.4s, v17.4s\n"
"fmin v29.4s, v29.4s, v14.4s\n"
- "str q29, [x10, x13]\n"
- "fmla v23.4s, v0.4s, v18.4s\n"
- "fmla v22.4s, v2.4s, v17.4s\n"
- "ldr x20, [x17, #0x20]\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "fmla v25.4s, v8.4s, v16.4s\n"
- "ldr q16, [x22, x14]\n"
- "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmla v23.4s, v2.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "add x14, x14, #0x10\n"
"fmax v26.4s, v26.4s, v15.4s\n"
- "fmla v27.4s, v8.4s, v17.4s\n"
- "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
"fmax v27.4s, v27.4s, v15.4s\n"
- "str q28, [x9, x13]\n"
- "fmla v23.4s, v8.4s, v16.4s\n"
+ "str q29, [x10, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
"fmla v21.4s, v7.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmla v23.4s, v6.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
"fmax v25.4s, v25.4s, v15.4s\n"
- "ldr x23, [x17, #0x28]\n"
- "fmla v22.4s, v6.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
"fmin v27.4s, v27.4s, v14.4s\n"
- "str q27, [x28, x13]\n"
- "ldr x22, [x17, #0x30]\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "str q26, [x27, x13]\n"
- "ldr x21, [x17, #0x38]\n"
+ "str q30, [x11, x13]\n"
+ "ldr x20, [x17, #0x20]\n"
"fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
"fmax v23.4s, v23.4s, v15.4s\n"
- "str q25, [x20, x13]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q28, [x9, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "str q27, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "str q26, [x20, x13]\n"
"ldr x20, [x17, #0x40]\n"
- "fmax v21.4s, v21.4s, v15.4s\n"
- "fmax v22.4s, v22.4s, v15.4s\n"
- "add x14, x14, #0x10\n"
"fmin v24.4s, v24.4s, v14.4s\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "str q24, [x23, x13]\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "fmin v22.4s, v22.4s, v14.4s\n"
- "str q23, [x22, x13]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q25, [x23, x13]\n"
+ "str q24, [x22, x13]\n"
"str q21, [x21, x13]\n"
- "str q22, [x20, x13]\n"
+ "str q23, [x20, x13]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 48f\n"
@@ -478,13 +478,13 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr q8, [x16, #0x90]\n"
"ldr x24, [x15, #0x0]\n"
"ldr x23, [x15, #0x8]\n"
- "add x24, x24, x14\n"
- "add x23, x23, x14\n"
"ldr x22, [x15, #0x10]\n"
"ldr x21, [x15, #0x18]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
"add x22, x22, x14\n"
"add x21, x21, x14\n"
- "ldr x20, [x15, #0x20]\n"
"add x20, x20, x14\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x24], #0x8\n"
@@ -509,23 +509,23 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
"mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
"ldr x20, [x15, #0x28]\n"
- "add x20, x20, x14\n"
"mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
"mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
"mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
"mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
"mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "add x20, x20, x14\n"
"fmla v23.4s, v0.4s, v10.4s\n"
"fmla v25.4s, v2.4s, v11.4s\n"
- "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
"fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "fmla v23.4s, v5.4s, v13.4s\n"
"fmla v24.4s, v4.4s, v13.4s\n"
- "fmla v25.4s, v3.4s, v13.4s\n"
"fmla v26.4s, v2.4s, v13.4s\n"
"fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
"fmla v28.4s, v0.4s, v13.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
"tbz %x[n_channels], #1, 6f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
@@ -548,11 +548,11 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x38]\n"
"fmla v23.4s, v7.4s, v11.4s\n"
"fmla v24.4s, v6.4s, v11.4s\n"
- "add x20, x20, x14\n"
"fmla v26.4s, v4.4s, v11.4s\n"
"fmla v27.4s, v3.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v11.4s\n"
"fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 10f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
@@ -588,11 +588,11 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x50]\n"
"fmla v24.4s, v8.4s, v10.4s\n"
"fmla v25.4s, v7.4s, v10.4s\n"
- "add x20, x20, x14\n"
"fmla v27.4s, v5.4s, v10.4s\n"
"fmla v28.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
"fmla v31.4s, v1.4s, v10.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 16f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
@@ -640,11 +640,11 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x70]\n"
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "add x20, x20, x14\n"
"fmla v28.4s, v6.4s, v10.4s\n"
"fmla v29.4s, v5.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v10.4s\n"
"fmla v31.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -680,9 +680,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x88]\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v24.4s, v3.4s, v12.4s\n"
- "add x20, x20, x14\n"
"fmla v26.4s, v1.4s, v12.4s\n"
"fmla v27.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 30f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
@@ -694,9 +694,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x90]\n"
"fmla v24.4s, v5.4s, v11.4s\n"
"fmla v25.4s, v4.4s, v11.4s\n"
- "add x20, x20, x14\n"
"fmla v27.4s, v2.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 32f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
@@ -720,9 +720,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xa0]\n"
"fmla v26.4s, v7.4s, v12.4s\n"
"fmla v27.4s, v6.4s, v12.4s\n"
- "add x20, x20, x14\n"
"fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 36f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
@@ -734,8 +734,8 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xa8]\n"
"fmla v23.4s, v2.4s, v11.4s\n"
"fmla v24.4s, v1.4s, v11.4s\n"
- "add x20, x20, x14\n"
"fmla v25.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 38f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
@@ -747,9 +747,9 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xb0]\n"
"fmla v27.4s, v8.4s, v13.4s\n"
"fmla v28.4s, v7.4s, v13.4s\n"
- "add x20, x20, x14\n"
"fmla v30.4s, v5.4s, v13.4s\n"
"fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 40f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
@@ -761,8 +761,8 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xb8]\n"
"fmla v23.4s, v6.4s, v12.4s\n"
"fmla v26.4s, v3.4s, v12.4s\n"
- "add x20, x20, x14\n"
"fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 42f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 43f\n"
@@ -774,8 +774,8 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0xc0]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
- "add x20, x20, x14\n"
"fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 45f\n"
@@ -807,88 +807,88 @@ void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"fmin v31.4s, v31.4s, v14.4s\n"
"tbz %x[n_channels], #1, 46f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.d }[0], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.d }[0], [x23]\n"
"st1 { v25.d }[0], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.d }[0], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.d }[0], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
- "add x13, x13, #0x8\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.d }[0], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.d }[0], [x22]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 47f\n"
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.s }[2], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.s }[2], [x23]\n"
"st1 { v25.s }[2], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.s }[2], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.s }[2], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.s }[2], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.s }[2], [x22]\n"
+ "add x20, x20, x13\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Store: Bit 1: Unset
"ldr x20, [x17, #0x0]\n"
- "add x20, x20, x13\n"
- "st1 { v23.s }[0], [x20]\n"
"ldr x23, [x17, #0x8]\n"
"ldr x22, [x17, #0x10]\n"
"ldr x21, [x17, #0x18]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x20]\n"
"add x23, x23, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x17, #0x28]\n"
"add x22, x22, x13\n"
- "ldr x20, [x17, #0x20]\n"
"add x21, x21, x13\n"
- "add x20, x20, x13\n"
- "st1 { v24.s }[0], [x23]\n"
"st1 { v25.s }[0], [x22]\n"
- "ldr x23, [x17, #0x28]\n"
"ldr x22, [x17, #0x30]\n"
- "add x23, x23, x13\n"
"st1 { v26.s }[0], [x21]\n"
"ldr x21, [x17, #0x38]\n"
- "add x22, x22, x13\n"
- "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
"st1 { v27.s }[0], [x20]\n"
"ldr x20, [x17, #0x40]\n"
- "add x20, x20, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"st1 { v28.s }[0], [x23]\n"
+ "add x21, x21, x13\n"
"st1 { v29.s }[0], [x22]\n"
+ "add x20, x20, x13\n"
"st1 { v30.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"47:" // Oddments: Store: Bit 1: End
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 3adf8b0d9f..6de6c3658e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,56 +87,56 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
+ "mov x28, #0x0\n"
"mov x27, #0x0\n"
- "mov x26, #0x0\n"
"1:" // Tile loop
- "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x25, #0x4\n"
- "mov x23, #0x4\n"
- "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x21, #0x4\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x3, #0x10\n" // cntb _, ALL, #1
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
"ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
- "mov x6, #0x10\n" // cntb _, ALL, #1
- "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
- "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "lsl x4, x4, #0x2\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
- "lsl x5, x5, #0x2\n"
- "add x17, x4, x4\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x7, x7, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x15, x7, x24, LSL #2\n"
- "mul x20, x20, x23\n" // offset *= output_tile_size
- "add x14, x15, x24, LSL #2\n"
- "add x8, x8, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "lsr x13, %x[n_channels], #0x2\n"
- "add x12, x14, x24, LSL #2\n"
- "add x11, x17, x4\n"
- "add x10, x8, x22, LSL #2\n"
- "add x9, x12, x24, LSL #2\n"
- "add x28, x11, x4\n"
- "add x27, x10, x22, LSL #2\n"
- "add x23, x5, x5\n"
+ "lsr x6, %x[n_channels], #0x2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x8, #0x0\n"
"ld1r { v15.4s }, [x20]\n"
- "add x26, x9, x24, LSL #2\n"
- "add x25, x28, x4\n"
- "add x24, x27, x22, LSL #2\n"
- "add x22, x23, x5\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x6\n"
- "cbz x13, 4f\n"
- "ldr q14, [x16, #0x0]\n"
+ "mul x24, x28, x25\n" // offset = tile_i * ld_input_row
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x23, XZR, x3\n"
+ "mul x22, x28, x2\n" // offset = tile_i * ld_output_row
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x24, x27, x4, x24\n" // offset += tile_j * ld_input_col
+ "lsl x4, x4, #0x2\n"
+ "madd x22, x27, x5, x22\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x2\n"
+ "mul x24, x24, x26\n" // offset *= kernel_stride * output_size
+ "add x15, x4, x4\n"
+ "add x14, x15, x4\n"
+ "add x13, x14, x4\n"
+ "mul x22, x22, x21\n" // offset *= output_tile_size
+ "add x21, x5, x5\n"
+ "add x12, x13, x4\n"
+ "add x20, x21, x5\n"
+ "add x7, x7, x24, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x11, x7, x25, LSL #2\n"
+ "add x10, x11, x25, LSL #2\n"
+ "add x17, x17, x22, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x9, x10, x25, LSL #2\n"
+ "add x28, x17, x2, LSL #2\n"
+ "add x27, x9, x25, LSL #2\n"
+ "add x26, x28, x2, LSL #2\n"
+ "add x25, x27, x25, LSL #2\n"
+ "add x24, x26, x2, LSL #2\n"
+ "cbz x6, 4f\n"
+ "ldr q13, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "cmp x6, x13, LSL #4\n"
+ "cmp x3, x6, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
"ldr q3, [x16, #0x40]\n"
@@ -146,512 +146,512 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr q7, [x16, #0x80]\n"
"ldr q8, [x16, #0x90]\n"
"add x16, x16, #0xa0\n"
- "ldr q9, [x14, x17]\n"
+ "ldr q9, [x10, x15]\n"
"ld1 { v10.4s }, [x7]\n"
- "ldr q11, [x7, x25]\n"
- "ldr q12, [x14, x11]\n"
+ "ldr q11, [x7, x12]\n"
+ "ldr q12, [x10, x14]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "mov v26.16b, v14.16b\n fmla v26.4s, v4.4s, v9.4s\n"
- "mov v28.16b, v14.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "add x6, x6, #0x10\n"
- "cmp x6, x13, LSL #4\n"
- "mov v16.16b, v14.16b\n fmla v16.4s, v3.4s, v9.4s\n"
- "mov v22.16b, v14.16b\n fmla v22.4s, v1.4s, v9.4s\n"
- "add x20, x20, #0x10\n"
- "add x21, x21, #0x10\n"
- "mov v23.16b, v14.16b\n fmla v23.4s, v0.4s, v9.4s\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "mov v25.16b, v14.16b\n fmla v25.4s, v7.4s, v9.4s\n"
- "mov v17.16b, v14.16b\n fmla v17.4s, v6.4s, v9.4s\n"
- "mov v31.16b, v14.16b\n fmla v31.4s, v5.4s, v9.4s\n"
- "mov v20.16b, v14.16b\n fmla v20.4s, v2.4s, v9.4s\n"
- "ldr q9, [x12, x17]\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ld1 { v30.4s }, [x26]\n"
- "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q27, [x26, x25]\n"
- "fmla v16.4s, v4.4s, v12.4s\n"
- "fmla v22.4s, v2.4s, v12.4s\n"
- "fmla v23.4s, v1.4s, v12.4s\n"
- "mov v21.16b, v14.16b\n fmla v21.4s, v6.4s, v30.4s\n"
- "ldr q10, [x12, x11]\n"
- "fmla v26.4s, v7.4s, v9.4s\n"
- "fmla v25.4s, v8.4s, v12.4s\n"
- "fmla v17.4s, v7.4s, v12.4s\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "mov v24.16b, v14.16b\n fmla v24.4s, v3.4s, v12.4s\n"
- "mov v19.16b, v14.16b\n fmla v19.4s, v0.4s, v12.4s\n"
- "ldr q11, [x7, x4]\n"
- "mov v30.16b, v14.16b\n fmla v30.4s, v8.4s, v27.4s\n"
- "ldr q12, [x7, x28]\n"
- "fmla v16.4s, v6.4s, v9.4s\n"
- "fmla v22.4s, v4.4s, v9.4s\n"
- "fmla v23.4s, v3.4s, v9.4s\n"
- "mov v27.16b, v14.16b\n fmla v27.4s, v1.4s, v9.4s\n"
- "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
- "ldr q14, [x16, #0x0]\n"
- "fmla v31.4s, v8.4s, v9.4s\n"
- "fmla v20.4s, v5.4s, v9.4s\n"
- "fmla v21.4s, v2.4s, v9.4s\n"
- "ld1 { v9.4s }, [x15]\n"
- "fmla v26.4s, v8.4s, v10.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
- "fmla v25.4s, v0.4s, v11.4s\n"
- "ldr q11, [x15, x25]\n"
- "fmla v17.4s, v2.4s, v12.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "ld1 { v12.4s }, [x9]\n"
- "fmla v16.4s, v7.4s, v10.4s\n"
- "fmla v24.4s, v6.4s, v10.4s\n"
- "fmla v22.4s, v5.4s, v10.4s\n"
- "fmla v23.4s, v4.4s, v10.4s\n"
- "fmla v19.4s, v3.4s, v10.4s\n"
- "fmla v27.4s, v2.4s, v10.4s\n"
- "fmla v18.4s, v1.4s, v10.4s\n"
- "fmla v30.4s, v0.4s, v10.4s\n"
- "ldr q10, [x15, x17]\n"
- "fmla v31.4s, v0.4s, v9.4s\n"
- "fmla v20.4s, v6.4s, v12.4s\n"
- "fmla v21.4s, v3.4s, v12.4s\n"
- "ldr q12, [x9, x25]\n"
- "fmla v26.4s, v1.4s, v10.4s\n"
- "fmla v28.4s, v3.4s, v9.4s\n"
- "fmla v29.4s, v5.4s, v11.4s\n"
- "fmla v24.4s, v2.4s, v11.4s\n"
- "ldr q11, [x15, x11]\n"
- "fmla v25.4s, v4.4s, v10.4s\n"
- "fmla v17.4s, v3.4s, v10.4s\n"
- "fmla v16.4s, v0.4s, v10.4s\n"
- "fmla v19.4s, v8.4s, v12.4s\n"
- "fmla v30.4s, v5.4s, v12.4s\n"
- "ldr q9, [x26, x4]\n"
- "fmla v31.4s, v2.4s, v10.4s\n"
- "fmla v26.4s, v2.4s, v11.4s\n"
- "fmla v28.4s, v5.4s, v10.4s\n"
- "ldr q10, [x14, x4]\n"
- "fmla v25.4s, v5.4s, v11.4s\n"
- "fmla v17.4s, v4.4s, v11.4s\n"
- "fmla v29.4s, v3.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
- "fmla v24.4s, v0.4s, v11.4s\n"
- "ldr q11, [x14, x28]\n"
- "fmla v21.4s, v7.4s, v9.4s\n"
- "fmla v27.4s, v6.4s, v9.4s\n"
- "ldr q12, [x26, x28]\n"
- "fmla v31.4s, v4.4s, v10.4s\n"
- "fmla v26.4s, v3.4s, v10.4s\n"
- "fmla v20.4s, v1.4s, v10.4s\n"
- "fmla v22.4s, v0.4s, v10.4s\n"
- "fmla v28.4s, v7.4s, v10.4s\n"
- "fmla v25.4s, v6.4s, v10.4s\n"
- "ldr q10, [x7, x17]\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v30.4s, v7.4s, v12.4s\n"
- "ldr q9, [x12, x4]\n"
- "fmla v17.4s, v8.4s, v11.4s\n"
- "fmla v29.4s, v7.4s, v11.4s\n"
- "fmla v16.4s, v5.4s, v11.4s\n"
- "fmla v24.4s, v4.4s, v11.4s\n"
- "fmla v23.4s, v2.4s, v11.4s\n"
- "fmla v19.4s, v1.4s, v11.4s\n"
- "ldr q12, [x7, x11]\n"
- "add x7, x7, #0x10\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v8.4s, v9.4s\n"
+ "add x3, x3, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "cmp x3, x6, LSL #4\n"
+ "add x8, x8, #0x10\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v7.4s, v9.4s\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v5.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ld1 { v26.4s }, [x25]\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x25, x12]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v22.4s, v8.4s, v12.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v6.4s, v26.4s\n"
+ "ldr q11, [x9, x14]\n"
"fmla v31.4s, v7.4s, v9.4s\n"
- "fmla v26.4s, v6.4s, v9.4s\n"
- "fmla v20.4s, v4.4s, v9.4s\n"
- "fmla v22.4s, v3.4s, v9.4s\n"
- "fmla v21.4s, v1.4s, v9.4s\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "ldr q9, [x12, x28]\n"
- "fmla v28.4s, v2.4s, v10.4s\n"
- "fmla v25.4s, v1.4s, v10.4s\n"
- "fmla v17.4s, v0.4s, v10.4s\n"
- "ld1 { v10.4s }, [x14]\n"
- "fmla v18.4s, v2.4s, v9.4s\n"
- "fmla v29.4s, v0.4s, v12.4s\n"
- "fmla v31.4s, v3.4s, v10.4s\n"
- "fmla v20.4s, v0.4s, v10.4s\n"
- "fmla v16.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v7.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v9.4s\n"
- "fmla v19.4s, v4.4s, v9.4s\n"
- "fmla v30.4s, v1.4s, v9.4s\n"
- "ldr q11, [x9, x17]\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
- "fmla v17.4s, v1.4s, v12.4s\n"
- "ldr q12, [x14, x25]\n"
- "add x14, x14, #0x10\n"
- "ldr q9, [x14, x17]\n"
- "fmla v28.4s, v6.4s, v10.4s\n"
- "ld1 { v10.4s }, [x12]\n"
- "fmla v27.4s, v4.4s, v11.4s\n"
- "fmla v18.4s, v3.4s, v11.4s\n"
- "fmla v29.4s, v8.4s, v12.4s\n"
- "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v18.4s, v6.4s, v12.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v3.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "ldr q10, [x7, x4]\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v17.4s\n"
+ "ldr q12, [x7, x13]\n"
+ "fmla v23.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmla v21.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "ldr q13, [x16, #0x0]\n"
+ "fmla v29.4s, v8.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v9.4s\n"
+ "fmla v20.4s, v2.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x11]\n"
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x11, x12]\n"
"fmla v19.4s, v2.4s, v12.4s\n"
- "ldr q12, [x12, x25]\n"
- "add x12, x12, #0x10\n"
- "fmla v31.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v3.4s, v10.4s\n"
- "fmla v21.4s, v0.4s, v10.4s\n"
- "ldr q10, [x26, x17]\n"
- "fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v27.4s, v7.4s, v10.4s\n"
- "fmla v18.4s, v6.4s, v10.4s\n"
- "fmla v20.4s, v8.4s, v11.4s\n"
- "fmla v22.4s, v7.4s, v11.4s\n"
- "fmla v23.4s, v6.4s, v11.4s\n"
- "fmla v21.4s, v5.4s, v11.4s\n"
- "ldr q11, [x9, x11]\n"
- "fmla v19.4s, v5.4s, v12.4s\n"
- "fmla v27.4s, v5.4s, v11.4s\n"
- "fmla v18.4s, v4.4s, v11.4s\n"
- "fmla v30.4s, v3.4s, v11.4s\n"
- "fmla v24.4s, v8.4s, v12.4s\n"
- "ldr q12, [x26, x11]\n"
- "fmla v21.4s, v8.4s, v10.4s\n"
- "ldr q10, [x15, x4]\n"
- "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x27]\n"
"fmla v23.4s, v7.4s, v11.4s\n"
- "add x26, x26, #0x10\n"
- "fmla v19.4s, v6.4s, v11.4s\n"
- "ldr q11, [x15, x28]\n"
- "fmla v27.4s, v8.4s, v12.4s\n"
- "add x15, x15, #0x10\n"
- "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "fmla v17.4s, v1.4s, v11.4s\n"
+ "fmla v16.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x11, x15]\n"
+ "fmla v29.4s, v0.4s, v9.4s\n"
"fmla v30.4s, v6.4s, v12.4s\n"
- "ldr q12, [x9, x4]\n"
- "fmla v28.4s, v4.4s, v10.4s\n"
- "fmla v25.4s, v3.4s, v10.4s\n"
- "fmax v28.4s, v28.4s, v13.4s\n"
- "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x27, x12]\n"
+ "fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "fmla v18.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v2.4s, v10.4s\n"
+ "ldr q10, [x11, x14]\n"
+ "fmla v22.4s, v4.4s, v11.4s\n"
+ "fmla v19.4s, v3.4s, v11.4s\n"
+ "fmla v23.4s, v0.4s, v11.4s\n"
+ "fmla v27.4s, v8.4s, v12.4s\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "ldr q9, [x25, x4]\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "ldr q12, [x10, x4]\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v19.4s, v4.4s, v10.4s\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
"fmla v26.4s, v0.4s, v10.4s\n"
- "ldr q10, [x9, x28]\n"
- "ldr q0, [x16, #0x10]\n"
- "fmla v17.4s, v5.4s, v11.4s\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmax v25.4s, v25.4s, v13.4s\n"
+ "ldr q11, [x10, x13]\n"
+ "fmla v20.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x25, x13]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v0.4s, v12.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q10, [x7, x15]\n"
+ "fmla v17.4s, v8.4s, v9.4s\n"
+ "fmla v16.4s, v7.4s, v9.4s\n"
+ "ldr q9, [x9, x4]\n"
+ "fmla v19.4s, v8.4s, v11.4s\n"
+ "fmla v18.4s, v7.4s, v11.4s\n"
+ "fmla v23.4s, v5.4s, v11.4s\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v11.4s\n"
+ "fmla v27.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x7, x14]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.4s, v7.4s, v9.4s\n"
+ "fmla v31.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v24.4s, v2.4s, v10.4s\n"
+ "fmla v22.4s, v1.4s, v10.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x10]\n"
+ "fmla v18.4s, v0.4s, v12.4s\n"
+ "fmla v17.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v8.4s, v9.4s\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v9.4s\n"
+ "fmla v29.4s, v3.4s, v10.4s\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
+ "fmla v27.4s, v4.4s, v9.4s\n"
+ "fmla v16.4s, v1.4s, v9.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x10, x12]\n"
+ "add x10, x10, #0x10\n"
+ "ldr q9, [x10, x15]\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x9]\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "fmla v17.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x9, x12]\n"
+ "fmla v29.4s, v6.4s, v10.4s\n"
"add x9, x9, #0x10\n"
- "fmla v16.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x25, x15]\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v8.4s, v12.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "ldr q12, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v25.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v16.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x11, x13]\n"
+ "fmla v20.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x11, x4]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v25.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v16.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x27, x4]\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v10.4s\n"
+ "fmla v22.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
"ldr q2, [x16, #0x30]\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q11, [x7, x25]\n"
+ "fmla v26.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x7, x12]\n"
"ldr q1, [x16, #0x20]\n"
- "fmla v20.4s, v7.4s, v12.4s\n"
- "fmla v22.4s, v6.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
"ldr q6, [x16, #0x70]\n"
- "fmla v21.4s, v4.4s, v12.4s\n"
- "fmla v27.4s, v3.4s, v12.4s\n"
- "ldr q12, [x14, x11]\n"
+ "fmla v20.4s, v4.4s, v12.4s\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x10, x14]\n"
"ldr q3, [x16, #0x40]\n"
- "fmla v23.4s, v8.4s, v10.4s\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
"ldr q8, [x16, #0x90]\n"
- "fmla v19.4s, v7.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
"ldr q7, [x16, #0x80]\n"
- "fmla v18.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v5.4s, v10.4s\n"
"ldr q5, [x16, #0x60]\n"
- "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
"ld1 { v10.4s }, [x7]\n"
"ldr q4, [x16, #0x50]\n"
- "fmax v17.4s, v17.4s, v13.4s\n"
- "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v14.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
"add x16, x16, #0xa0\n"
- "fmax v31.4s, v31.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v13.4s\n"
- "fmax v16.4s, v16.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v13.4s\n"
- "fmax v20.4s, v20.4s, v13.4s\n"
- "fmax v22.4s, v22.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v13.4s\n"
- "fmax v19.4s, v19.4s, v13.4s\n"
- "fmax v21.4s, v21.4s, v13.4s\n"
- "fmax v27.4s, v27.4s, v13.4s\n"
- "fmax v18.4s, v18.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v13.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "st1 { v28.4s }, [x8]\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "str q25, [x8, x5]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "str q17, [x8, x23]\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v14.4s\n"
+ "fmax v23.4s, v23.4s, v14.4s\n"
+ "fmax v26.4s, v26.4s, v14.4s\n"
+ "fmax v30.4s, v30.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v14.4s\n"
+ "fmax v21.4s, v21.4s, v14.4s\n"
+ "fmax v27.4s, v27.4s, v14.4s\n"
+ "fmax v20.4s, v20.4s, v14.4s\n"
+ "fmax v25.4s, v25.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmax v16.4s, v16.4s, v14.4s\n"
"fmin v24.4s, v24.4s, v15.4s\n"
- "str q29, [x8, x22]\n"
- "add x8, x8, #0x10\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
"fmin v22.4s, v22.4s, v15.4s\n"
- "st1 { v31.4s }, [x10]\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
"fmin v19.4s, v19.4s, v15.4s\n"
- "str q26, [x10, x5]\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q16, [x10, x23]\n"
"fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "st1 { v24.4s }, [x17]\n"
"fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [x10, x22]\n"
- "add x10, x10, #0x10\n"
- "st1 { v20.4s }, [x27]\n"
- "str q22, [x27, x5]\n"
- "str q23, [x27, x23]\n"
- "str q19, [x27, x22]\n"
- "add x27, x27, #0x10\n"
- "st1 { v21.4s }, [x24]\n"
- "str q27, [x24, x5]\n"
- "str q18, [x24, x23]\n"
- "str q30, [x24, x22]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q22, [x17, x5]\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q19, [x17, x21]\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "str q18, [x17, x20]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "st1 { v29.4s }, [x28]\n"
+ "str q31, [x28, x5]\n"
+ "str q23, [x28, x21]\n"
+ "str q26, [x28, x20]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v30.4s }, [x26]\n"
+ "str q28, [x26, x5]\n"
+ "str q21, [x26, x21]\n"
+ "str q27, [x26, x20]\n"
+ "add x26, x26, #0x10\n"
+ "st1 { v20.4s }, [x24]\n"
+ "str q25, [x24, x5]\n"
+ "str q17, [x24, x21]\n"
+ "str q16, [x24, x20]\n"
"add x24, x24, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v16.16b, v14.16b\n fmla v16.4s, v4.4s, v9.4s\n"
- "mov v23.16b, v14.16b\n fmla v23.4s, v8.4s, v9.4s\n"
- "mov v31.16b, v14.16b\n fmla v31.4s, v3.4s, v9.4s\n"
- "mov v30.16b, v14.16b\n fmla v30.4s, v1.4s, v9.4s\n"
- "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
- "fmla v16.4s, v5.4s, v12.4s\n"
- "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "mov v19.16b, v14.16b\n fmla v19.4s, v6.4s, v9.4s\n"
- "mov v28.16b, v14.16b\n fmla v28.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v14.16b\n fmla v27.4s, v2.4s, v9.4s\n"
- "ldr q24, [x12, x17]\n"
- "fmla v23.4s, v0.4s, v10.4s\n"
- "ld1 { v21.4s }, [x26]\n"
- "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
- "ldr q20, [x26, x25]\n"
- "fmla v31.4s, v4.4s, v12.4s\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v6.4s, v9.4s\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v5.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x9, x15]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ld1 { v22.4s }, [x25]\n"
+ "mov v10.16b, v13.16b\n fmla v10.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x25, x12]\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
- "fmla v18.4s, v1.4s, v12.4s\n"
- "mov v26.16b, v14.16b\n fmla v26.4s, v6.4s, v21.4s\n"
- "ldr q9, [x12, x11]\n"
- "fmla v16.4s, v7.4s, v24.4s\n"
- "fmla v17.4s, v8.4s, v12.4s\n"
- "fmla v19.4s, v7.4s, v12.4s\n"
- "fmla v29.4s, v6.4s, v12.4s\n"
- "mov v11.16b, v14.16b\n fmla v11.4s, v3.4s, v12.4s\n"
- "mov v10.16b, v14.16b\n fmla v10.4s, v0.4s, v12.4s\n"
- "ldr q22, [x7, x4]\n"
- "mov v25.16b, v14.16b\n fmla v25.4s, v8.4s, v20.4s\n"
- "ldr q21, [x7, x28]\n"
- "fmla v31.4s, v6.4s, v24.4s\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v8.4s, v12.4s\n"
+ "mov v9.16b, v13.16b\n fmla v9.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x9, x14]\n"
+ "fmla v31.4s, v7.4s, v24.4s\n"
+ "fmla v21.4s, v7.4s, v12.4s\n"
+ "fmla v10.4s, v6.4s, v12.4s\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v3.4s, v12.4s\n"
+ "mov v11.16b, v13.16b\n fmla v11.4s, v0.4s, v12.4s\n"
+ "ldr q23, [x7, x4]\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x7, x13]\n"
+ "fmla v29.4s, v6.4s, v24.4s\n"
"fmla v30.4s, v4.4s, v24.4s\n"
- "fmla v18.4s, v3.4s, v24.4s\n"
- "mov v12.16b, v14.16b\n fmla v12.4s, v1.4s, v24.4s\n"
- "fmla v14.4s, v0.4s, v24.4s\n"
- "fmla v28.4s, v8.4s, v24.4s\n"
+ "fmla v19.4s, v3.4s, v24.4s\n"
+ "mov v12.16b, v13.16b\n fmla v12.4s, v1.4s, v24.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v0.4s, v24.4s\n"
+ "fmla v18.4s, v8.4s, v24.4s\n"
"fmla v27.4s, v5.4s, v24.4s\n"
- "fmla v26.4s, v2.4s, v24.4s\n"
- "ld1 { v24.4s }, [x15]\n"
- "fmla v16.4s, v8.4s, v9.4s\n"
- "fmla v23.4s, v1.4s, v22.4s\n"
- "fmla v17.4s, v0.4s, v22.4s\n"
- "ldr q22, [x15, x25]\n"
- "fmla v19.4s, v2.4s, v21.4s\n"
- "fmla v29.4s, v1.4s, v21.4s\n"
- "ld1 { v20.4s }, [x9]\n"
- "fmla v31.4s, v7.4s, v9.4s\n"
- "fmla v11.4s, v6.4s, v9.4s\n"
- "fmla v30.4s, v5.4s, v9.4s\n"
- "fmla v18.4s, v4.4s, v9.4s\n"
- "fmla v10.4s, v3.4s, v9.4s\n"
- "fmla v12.4s, v2.4s, v9.4s\n"
- "fmla v14.4s, v1.4s, v9.4s\n"
- "fmla v25.4s, v0.4s, v9.4s\n"
- "ldr q21, [x15, x17]\n"
- "fmla v28.4s, v0.4s, v24.4s\n"
- "fmla v27.4s, v6.4s, v20.4s\n"
- "fmla v26.4s, v3.4s, v20.4s\n"
- "ldr q20, [x9, x25]\n"
- "fmla v16.4s, v1.4s, v21.4s\n"
- "fmla v23.4s, v3.4s, v24.4s\n"
- "fmla v29.4s, v5.4s, v22.4s\n"
- "fmla v11.4s, v2.4s, v22.4s\n"
- "ldr q22, [x15, x11]\n"
- "fmla v17.4s, v4.4s, v21.4s\n"
- "fmla v19.4s, v3.4s, v21.4s\n"
- "fmla v31.4s, v0.4s, v21.4s\n"
- "fmla v10.4s, v8.4s, v20.4s\n"
- "fmla v25.4s, v5.4s, v20.4s\n"
- "ldr q20, [x26, x4]\n"
- "fmla v28.4s, v2.4s, v21.4s\n"
- "fmla v16.4s, v2.4s, v22.4s\n"
- "fmla v23.4s, v5.4s, v21.4s\n"
- "ldr q21, [x14, x4]\n"
- "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v9.4s, v2.4s, v24.4s\n"
+ "ld1 { v24.4s }, [x11]\n"
+ "fmla v31.4s, v8.4s, v22.4s\n"
+ "fmla v17.4s, v1.4s, v23.4s\n"
+ "fmla v20.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x11, x12]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v10.4s, v1.4s, v16.4s\n"
+ "ld1 { v16.4s }, [x27]\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "fmla v28.4s, v6.4s, v22.4s\n"
+ "fmla v30.4s, v5.4s, v22.4s\n"
"fmla v19.4s, v4.4s, v22.4s\n"
- "fmla v29.4s, v3.4s, v22.4s\n"
+ "fmla v11.4s, v3.4s, v22.4s\n"
+ "fmla v12.4s, v2.4s, v22.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v26.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x11, x15]\n"
+ "fmla v18.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
+ "fmla v9.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x27, x12]\n"
+ "fmla v17.4s, v3.4s, v24.4s\n"
"fmla v31.4s, v1.4s, v22.4s\n"
- "fmla v11.4s, v0.4s, v22.4s\n"
- "ldr q22, [x14, x28]\n"
- "fmla v26.4s, v7.4s, v20.4s\n"
- "fmla v12.4s, v6.4s, v20.4s\n"
- "ldr q20, [x26, x28]\n"
- "fmla v28.4s, v4.4s, v21.4s\n"
- "fmla v16.4s, v3.4s, v21.4s\n"
- "fmla v27.4s, v1.4s, v21.4s\n"
- "fmla v30.4s, v0.4s, v21.4s\n"
- "fmla v23.4s, v7.4s, v21.4s\n"
- "fmla v17.4s, v6.4s, v21.4s\n"
- "ldr q21, [x7, x17]\n"
- "fmla v14.4s, v8.4s, v20.4s\n"
- "fmla v25.4s, v7.4s, v20.4s\n"
- "ldr q20, [x12, x4]\n"
- "fmla v19.4s, v8.4s, v22.4s\n"
- "fmla v29.4s, v7.4s, v22.4s\n"
- "fmla v31.4s, v5.4s, v22.4s\n"
- "fmla v11.4s, v4.4s, v22.4s\n"
+ "fmla v10.4s, v5.4s, v23.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x11, x14]\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v21.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v11.4s, v8.4s, v16.4s\n"
+ "fmla v26.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x25, x4]\n"
"fmla v18.4s, v2.4s, v22.4s\n"
- "fmla v10.4s, v1.4s, v22.4s\n"
- "ldr q22, [x7, x11]\n"
+ "fmla v31.4s, v2.4s, v23.4s\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "ldr q22, [x10, x4]\n"
+ "fmla v20.4s, v5.4s, v23.4s\n"
+ "fmla v21.4s, v4.4s, v23.4s\n"
+ "fmla v10.4s, v3.4s, v23.4s\n"
+ "fmla v29.4s, v1.4s, v23.4s\n"
+ "fmla v28.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x10, x13]\n"
+ "fmla v9.4s, v7.4s, v16.4s\n"
+ "fmla v12.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x25, x13]\n"
+ "fmla v18.4s, v4.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v30.4s, v0.4s, v22.4s\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v20.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x7, x15]\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "fmla v26.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x9, x4]\n"
+ "fmla v21.4s, v8.4s, v23.4s\n"
+ "fmla v10.4s, v7.4s, v23.4s\n"
+ "fmla v29.4s, v5.4s, v23.4s\n"
+ "fmla v28.4s, v4.4s, v23.4s\n"
+ "fmla v19.4s, v2.4s, v23.4s\n"
+ "fmla v11.4s, v1.4s, v23.4s\n"
+ "ldr q23, [x7, x14]\n"
"add x7, x7, #0x10\n"
- "fmla v28.4s, v7.4s, v20.4s\n"
- "fmla v16.4s, v6.4s, v20.4s\n"
- "fmla v27.4s, v4.4s, v20.4s\n"
- "fmla v30.4s, v3.4s, v20.4s\n"
- "fmla v26.4s, v1.4s, v20.4s\n"
- "fmla v12.4s, v0.4s, v20.4s\n"
- "ldr q20, [x12, x28]\n"
- "fmla v23.4s, v2.4s, v21.4s\n"
- "fmla v17.4s, v1.4s, v21.4s\n"
- "fmla v19.4s, v0.4s, v21.4s\n"
- "ld1 { v21.4s }, [x14]\n"
- "fmla v14.4s, v2.4s, v20.4s\n"
- "fmla v29.4s, v0.4s, v22.4s\n"
- "fmla v28.4s, v3.4s, v21.4s\n"
- "fmla v27.4s, v0.4s, v21.4s\n"
- "fmla v31.4s, v8.4s, v20.4s\n"
- "fmla v11.4s, v7.4s, v20.4s\n"
- "fmla v18.4s, v5.4s, v20.4s\n"
- "fmla v10.4s, v4.4s, v20.4s\n"
- "fmla v25.4s, v1.4s, v20.4s\n"
- "ldr q24, [x9, x17]\n"
+ "fmla v18.4s, v7.4s, v16.4s\n"
+ "fmla v31.4s, v6.4s, v16.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v30.4s, v3.4s, v16.4s\n"
+ "fmla v9.4s, v1.4s, v16.4s\n"
+ "fmla v12.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x13]\n"
"fmla v17.4s, v2.4s, v22.4s\n"
- "fmla v19.4s, v1.4s, v22.4s\n"
- "ldr q20, [x14, x25]\n"
- "add x14, x14, #0x10\n"
- "fmla v23.4s, v6.4s, v21.4s\n"
- "ld1 { v21.4s }, [x12]\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v0.4s, v22.4s\n"
+ "ld1 { v22.4s }, [x10]\n"
+ "fmla v10.4s, v0.4s, v23.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v8.4s, v16.4s\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
+ "fmla v11.4s, v4.4s, v16.4s\n"
+ "fmla v26.4s, v1.4s, v16.4s\n"
+ "ldr q24, [x27, x15]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "ldr q16, [x10, x12]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "ld1 { v22.4s }, [x9]\n"
"fmla v12.4s, v4.4s, v24.4s\n"
- "fmla v14.4s, v3.4s, v24.4s\n"
- "fmla v29.4s, v8.4s, v20.4s\n"
- "fmla v11.4s, v5.4s, v20.4s\n"
- "fmla v10.4s, v2.4s, v20.4s\n"
- "ldr q20, [x12, x25]\n"
- "add x12, x12, #0x10\n"
- "fmla v28.4s, v6.4s, v21.4s\n"
- "fmla v27.4s, v3.4s, v21.4s\n"
- "fmla v26.4s, v0.4s, v21.4s\n"
- "ldr q22, [x26, x17]\n"
- "fmla v25.4s, v2.4s, v20.4s\n"
- "fmla v12.4s, v7.4s, v22.4s\n"
- "fmla v14.4s, v6.4s, v22.4s\n"
- "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v25.4s, v3.4s, v24.4s\n"
"fmla v30.4s, v7.4s, v24.4s\n"
- "fmla v18.4s, v6.4s, v24.4s\n"
- "fmla v26.4s, v5.4s, v24.4s\n"
- "ldr q21, [x9, x11]\n"
- "fmla v10.4s, v5.4s, v20.4s\n"
- "fmla v12.4s, v5.4s, v21.4s\n"
- "fmla v14.4s, v4.4s, v21.4s\n"
- "fmla v25.4s, v3.4s, v21.4s\n"
- "fmla v11.4s, v8.4s, v20.4s\n"
- "ldr q20, [x26, x11]\n"
- "fmla v26.4s, v8.4s, v22.4s\n"
- "ldr q9, [x15, x4]\n"
- "fmla v30.4s, v8.4s, v21.4s\n"
- "fmla v18.4s, v7.4s, v21.4s\n"
- "add x26, x26, #0x10\n"
- "fmla v10.4s, v6.4s, v21.4s\n"
- "ldr q21, [x15, x28]\n"
- "fmla v12.4s, v8.4s, v20.4s\n"
- "add x15, x15, #0x10\n"
- "fmla v14.4s, v7.4s, v20.4s\n"
- "fmla v25.4s, v6.4s, v20.4s\n"
- "ldr q24, [x9, x4]\n"
- "fmla v23.4s, v4.4s, v9.4s\n"
- "fmla v17.4s, v3.4s, v9.4s\n"
- "fmax v23.4s, v23.4s, v13.4s\n"
- "fmla v28.4s, v1.4s, v9.4s\n"
- "fmla v16.4s, v0.4s, v9.4s\n"
- "ldr q0, [x9, x28]\n"
- "fmax v17.4s, v17.4s, v13.4s\n"
- "fmla v19.4s, v5.4s, v21.4s\n"
- "fmla v29.4s, v4.4s, v21.4s\n"
- "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmla v10.4s, v8.4s, v16.4s\n"
+ "fmla v28.4s, v5.4s, v16.4s\n"
+ "fmla v11.4s, v2.4s, v16.4s\n"
+ "ldr q16, [x9, x12]\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
"add x9, x9, #0x10\n"
- "fmla v31.4s, v2.4s, v21.4s\n"
- "fmla v11.4s, v1.4s, v21.4s\n"
- "fmax v29.4s, v29.4s, v13.4s\n"
- "fmla v27.4s, v7.4s, v24.4s\n"
- "fmla v30.4s, v6.4s, v24.4s\n"
- "fmax v28.4s, v28.4s, v13.4s\n"
- "fmla v26.4s, v4.4s, v24.4s\n"
- "fmla v12.4s, v3.4s, v24.4s\n"
- "fmax v16.4s, v16.4s, v13.4s\n"
- "fmla v18.4s, v8.4s, v0.4s\n"
- "fmla v10.4s, v7.4s, v0.4s\n"
- "fmax v31.4s, v31.4s, v13.4s\n"
- "fmla v14.4s, v5.4s, v0.4s\n"
- "fmla v25.4s, v4.4s, v0.4s\n"
- "fmax v11.4s, v11.4s, v13.4s\n"
- "fmax v27.4s, v27.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v13.4s\n"
- "fmax v18.4s, v18.4s, v13.4s\n"
- "fmax v10.4s, v10.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v13.4s\n"
- "fmax v12.4s, v12.4s, v13.4s\n"
- "fmax v14.4s, v14.4s, v13.4s\n"
- "fmax v25.4s, v25.4s, v13.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v9.4s, v0.4s, v22.4s\n"
+ "ldr q23, [x25, x15]\n"
+ "fmla v19.4s, v6.4s, v24.4s\n"
+ "fmla v26.4s, v2.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v6.4s, v23.4s\n"
+ "fmla v11.4s, v5.4s, v16.4s\n"
+ "ldr q22, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v9.4s, v5.4s, v24.4s\n"
+ "ldr q16, [x27, x14]\n"
+ "fmla v12.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v16.4s\n"
+ "fmla v26.4s, v3.4s, v16.4s\n"
+ "fmla v30.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmla v11.4s, v6.4s, v16.4s\n"
+ "ldr q24, [x11, x13]\n"
+ "fmla v9.4s, v8.4s, v23.4s\n"
+ "ldr q16, [x11, x4]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v12.4s, v8.4s, v22.4s\n"
+ "fmla v25.4s, v7.4s, v22.4s\n"
+ "fmla v26.4s, v6.4s, v22.4s\n"
+ "ldr q23, [x27, x4]\n"
+ "fmla v21.4s, v5.4s, v24.4s\n"
+ "fmla v10.4s, v4.4s, v24.4s\n"
+ "fmla v17.4s, v4.4s, v16.4s\n"
+ "fmla v20.4s, v3.4s, v16.4s\n"
+ "fmla v18.4s, v1.4s, v16.4s\n"
+ "fmla v31.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x27, x13]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v29.4s, v2.4s, v24.4s\n"
+ "fmla v28.4s, v1.4s, v24.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v30.4s, v6.4s, v23.4s\n"
+ "fmax v21.4s, v21.4s, v14.4s\n"
+ "fmla v9.4s, v4.4s, v23.4s\n"
+ "fmla v12.4s, v3.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "fmla v11.4s, v7.4s, v16.4s\n"
+ "fmax v20.4s, v20.4s, v14.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "fmla v26.4s, v4.4s, v16.4s\n"
+ "fmax v10.4s, v10.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v14.4s\n"
+ "fmax v27.4s, v27.4s, v14.4s\n"
+ "fmax v30.4s, v30.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmax v11.4s, v11.4s, v14.4s\n"
+ "fmax v9.4s, v9.4s, v14.4s\n"
+ "fmax v12.4s, v12.4s, v14.4s\n"
+ "fmax v25.4s, v25.4s, v14.4s\n"
+ "fmax v26.4s, v26.4s, v14.4s\n"
"fmin v17.4s, v17.4s, v15.4s\n"
- "st1 { v23.4s }, [x8]\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v10.4s, v10.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"fmin v29.4s, v29.4s, v15.4s\n"
- "str q17, [x8, x5]\n"
"fmin v28.4s, v28.4s, v15.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "str q19, [x8, x23]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "fmin v11.4s, v11.4s, v15.4s\n"
- "str q29, [x8, x22]\n"
- "add x8, x8, #0x10\n"
+ "st1 { v17.4s }, [x17]\n"
"fmin v27.4s, v27.4s, v15.4s\n"
"fmin v30.4s, v30.4s, v15.4s\n"
- "st1 { v28.4s }, [x10]\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v10.4s, v10.4s, v15.4s\n"
- "str q16, [x10, x5]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
+ "str q20, [x17, x5]\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v15.4s\n"
+ "str q21, [x17, x21]\n"
+ "fmin v9.4s, v9.4s, v15.4s\n"
"fmin v12.4s, v12.4s, v15.4s\n"
- "str q31, [x10, x23]\n"
- "fmin v14.4s, v14.4s, v15.4s\n"
+ "str q10, [x17, x20]\n"
+ "add x17, x17, #0x10\n"
"fmin v25.4s, v25.4s, v15.4s\n"
- "str q11, [x10, x22]\n"
- "add x10, x10, #0x10\n"
- "st1 { v27.4s }, [x27]\n"
- "str q30, [x27, x5]\n"
- "str q18, [x27, x23]\n"
- "str q10, [x27, x22]\n"
- "add x27, x27, #0x10\n"
- "st1 { v26.4s }, [x24]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "st1 { v18.4s }, [x28]\n"
+ "str q31, [x28, x5]\n"
+ "str q29, [x28, x21]\n"
+ "str q28, [x28, x20]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v27.4s }, [x26]\n"
+ "str q30, [x26, x5]\n"
+ "str q19, [x26, x21]\n"
+ "str q11, [x26, x20]\n"
+ "add x26, x26, #0x10\n"
+ "st1 { v9.4s }, [x24]\n"
"str q12, [x24, x5]\n"
- "str q14, [x24, x23]\n"
- "str q25, [x24, x22]\n"
+ "str q25, [x24, x21]\n"
+ "str q26, [x24, x20]\n"
"add x24, x24, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 73f\n"
- "ldr q14, [x16, #0x0]\n"
+ "ldr q13, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "add x23, x14, x17\n"
+ "add x23, x10, x15\n"
"add x22, x7, XZR\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x21, x7, x25\n"
- "add x20, x14, x11\n"
+ "add x21, x7, x12\n"
+ "add x20, x10, x14\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"ldr q5, [x16, #0x60]\n"
@@ -675,27 +675,27 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ldr s11, [x21, #0x0]\n"
"ldr s12, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
- "mov v16.16b, v14.16b\n fmla v16.4s, v8.4s, v9.4s\n"
- "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
- "add x20, x26, XZR\n"
- "mov v18.16b, v14.16b\n fmla v18.4s, v6.4s, v9.4s\n"
- "mov v21.16b, v14.16b\n fmla v21.4s, v4.4s, v9.4s\n"
- "mov v22.16b, v14.16b\n fmla v22.4s, v3.4s, v9.4s\n"
- "mov v25.16b, v14.16b\n fmla v25.4s, v1.4s, v9.4s\n"
- "mov v26.16b, v14.16b\n fmla v26.4s, v0.4s, v9.4s\n"
- "mov v19.16b, v14.16b\n fmla v19.4s, v2.4s, v11.4s\n"
- "mov v20.16b, v14.16b\n fmla v20.4s, v5.4s, v9.4s\n"
- "mov v24.16b, v14.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v13.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "add x20, x25, XZR\n"
+ "mov v18.16b, v13.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v13.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v13.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v13.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v13.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v13.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v13.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v13.16b\n fmla v24.4s, v2.4s, v9.4s\n"
"fmla v16.4s, v0.4s, v10.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
"fmla v18.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
"fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
"fmla v22.4s, v4.4s, v12.4s\n"
- "mov v23.16b, v14.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "mov v23.16b, v13.16b\n fmla v23.4s, v3.4s, v12.4s\n"
"fmla v25.4s, v2.4s, v12.4s\n"
"fmla v26.4s, v1.4s, v12.4s\n"
- "mov v27.16b, v14.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "mov v27.16b, v13.16b\n fmla v27.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 7f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
@@ -704,8 +704,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
"ldr s10, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
- "mov v28.16b, v14.16b\n fmla v28.4s, v6.4s, v10.4s\n"
- "add x20, x26, x25\n"
+ "mov v28.16b, v13.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x25, x12\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
@@ -714,8 +714,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
"ldr s11, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
- "mov v31.16b, v14.16b\n fmla v31.4s, v8.4s, v11.4s\n"
- "add x20, x12, x17\n"
+ "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x9, x15\n"
"tbz %x[n_channels], #1, 11f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
@@ -732,8 +732,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla v25.4s, v4.4s, v9.4s\n"
"fmla v26.4s, v3.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v9.4s\n"
- "mov v29.16b, v14.16b\n fmla v29.4s, v1.4s, v9.4s\n"
- "mov v30.16b, v14.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "mov v29.16b, v13.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v13.16b\n fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 13f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
@@ -744,7 +744,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
"fmla v16.4s, v1.4s, v12.4s\n"
"fmla v17.4s, v0.4s, v12.4s\n"
- "add x20, x7, x28\n"
+ "add x20, x7, x13\n"
"tbz %x[n_channels], #1, 15f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
@@ -755,7 +755,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
"fmla v18.4s, v2.4s, v11.4s\n"
"fmla v19.4s, v1.4s, v11.4s\n"
- "add x20, x12, x11\n"
+ "add x20, x9, x14\n"
"tbz %x[n_channels], #1, 17f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
@@ -766,7 +766,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
"fmla v21.4s, v8.4s, v10.4s\n"
"fmla v22.4s, v7.4s, v10.4s\n"
- "add x20, x15, XZR\n"
+ "add x20, x11, XZR\n"
"fmla v23.4s, v6.4s, v10.4s\n"
"fmla v25.4s, v5.4s, v10.4s\n"
"fmla v26.4s, v4.4s, v10.4s\n"
@@ -784,7 +784,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
"fmla v16.4s, v3.4s, v9.4s\n"
"fmla v20.4s, v0.4s, v9.4s\n"
- "add x20, x15, x25\n"
+ "add x20, x11, x12\n"
"tbz %x[n_channels], #1, 21f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
@@ -795,7 +795,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
"fmla v19.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v2.4s, v12.4s\n"
- "add x20, x9, XZR\n"
+ "add x20, x27, XZR\n"
"tbz %x[n_channels], #1, 23f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
@@ -806,7 +806,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
"fmla v24.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x20, x15, x17\n"
+ "add x20, x11, x15\n"
"tbz %x[n_channels], #1, 25f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
@@ -817,7 +817,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
"fmla v16.4s, v5.4s, v10.4s\n"
"fmla v17.4s, v4.4s, v10.4s\n"
- "add x20, x9, x25\n"
+ "add x20, x27, x12\n"
"fmla v18.4s, v3.4s, v10.4s\n"
"fmla v20.4s, v2.4s, v10.4s\n"
"fmla v21.4s, v1.4s, v10.4s\n"
@@ -832,7 +832,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
"fmla v27.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v5.4s, v11.4s\n"
- "add x20, x15, x11\n"
+ "add x20, x11, x14\n"
"tbz %x[n_channels], #1, 29f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
@@ -843,7 +843,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
"fmla v17.4s, v5.4s, v12.4s\n"
"fmla v18.4s, v4.4s, v12.4s\n"
- "add x20, x26, x4\n"
+ "add x20, x25, x4\n"
"fmla v19.4s, v3.4s, v12.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
"fmla v22.4s, v1.4s, v12.4s\n"
@@ -858,7 +858,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
"fmla v28.4s, v7.4s, v11.4s\n"
"fmla v29.4s, v6.4s, v11.4s\n"
- "add x20, x14, x4\n"
+ "add x20, x10, x4\n"
"tbz %x[n_channels], #1, 33f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
@@ -869,7 +869,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
"fmla v16.4s, v7.4s, v10.4s\n"
"fmla v17.4s, v6.4s, v10.4s\n"
- "add x20, x26, x28\n"
+ "add x20, x25, x13\n"
"fmla v20.4s, v4.4s, v10.4s\n"
"fmla v21.4s, v3.4s, v10.4s\n"
"fmla v24.4s, v1.4s, v10.4s\n"
@@ -884,7 +884,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
"fmla v30.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v7.4s, v11.4s\n"
- "add x20, x14, x28\n"
+ "add x20, x10, x13\n"
"tbz %x[n_channels], #1, 37f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
@@ -895,7 +895,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
"fmla v18.4s, v8.4s, v12.4s\n"
"fmla v19.4s, v7.4s, v12.4s\n"
- "add x20, x7, x17\n"
+ "add x20, x7, x15\n"
"fmla v22.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v26.4s, v2.4s, v12.4s\n"
@@ -910,7 +910,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
"fmla v16.4s, v2.4s, v10.4s\n"
"fmla v17.4s, v1.4s, v10.4s\n"
- "add x20, x12, x4\n"
+ "add x20, x9, x4\n"
"fmla v18.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 41f\n"
"ldr d11, [x20], #0x8\n"
@@ -922,7 +922,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
"fmla v20.4s, v7.4s, v11.4s\n"
"fmla v21.4s, v6.4s, v11.4s\n"
- "add x20, x7, x11\n"
+ "add x20, x7, x14\n"
"fmla v24.4s, v4.4s, v11.4s\n"
"fmla v25.4s, v3.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
@@ -937,7 +937,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
"fmla v17.4s, v2.4s, v12.4s\n"
"fmla v18.4s, v1.4s, v12.4s\n"
- "add x20, x14, XZR\n"
+ "add x20, x10, XZR\n"
"fmla v19.4s, v0.4s, v12.4s\n"
"tbz %x[n_channels], #1, 45f\n"
"ldr d10, [x20], #0x8\n"
@@ -949,7 +949,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
"fmla v16.4s, v6.4s, v10.4s\n"
"fmla v20.4s, v3.4s, v10.4s\n"
- "add x20, x12, x28\n"
+ "add x20, x9, x13\n"
"fmla v24.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 47f\n"
"ldr d11, [x20], #0x8\n"
@@ -961,7 +961,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
"fmla v22.4s, v8.4s, v11.4s\n"
"fmla v23.4s, v7.4s, v11.4s\n"
- "add x20, x14, x25\n"
+ "add x20, x10, x12\n"
"fmla v26.4s, v5.4s, v11.4s\n"
"fmla v27.4s, v4.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v11.4s\n"
@@ -976,7 +976,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
"fmla v19.4s, v8.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v12.4s\n"
- "add x20, x12, XZR\n"
+ "add x20, x9, XZR\n"
"fmla v27.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 51f\n"
"ldr d10, [x20], #0x8\n"
@@ -988,7 +988,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
"fmla v20.4s, v6.4s, v10.4s\n"
"fmla v24.4s, v3.4s, v10.4s\n"
- "add x20, x9, x17\n"
+ "add x20, x27, x15\n"
"fmla v28.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 53f\n"
"ldr d11, [x20], #0x8\n"
@@ -1000,7 +1000,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
"fmla v24.4s, v8.4s, v11.4s\n"
"fmla v25.4s, v7.4s, v11.4s\n"
- "add x20, x12, x25\n"
+ "add x20, x9, x12\n"
"fmla v26.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
"fmla v29.4s, v4.4s, v11.4s\n"
@@ -1015,7 +1015,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
"fmla v23.4s, v8.4s, v12.4s\n"
"fmla v27.4s, v5.4s, v12.4s\n"
- "add x20, x26, x17\n"
+ "add x20, x25, x15\n"
"fmla v31.4s, v2.4s, v12.4s\n"
"tbz %x[n_channels], #1, 57f\n"
"ldr d10, [x20], #0x8\n"
@@ -1027,7 +1027,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
"fmla v28.4s, v8.4s, v10.4s\n"
"fmla v29.4s, v7.4s, v10.4s\n"
- "add x20, x9, x11\n"
+ "add x20, x27, x14\n"
"fmla v30.4s, v6.4s, v10.4s\n"
"tbz %x[n_channels], #1, 59f\n"
"ldr d11, [x20], #0x8\n"
@@ -1039,7 +1039,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v26.4s, v7.4s, v11.4s\n"
- "add x20, x26, x11\n"
+ "add x20, x25, x14\n"
"fmla v27.4s, v6.4s, v11.4s\n"
"fmla v29.4s, v5.4s, v11.4s\n"
"fmla v30.4s, v4.4s, v11.4s\n"
@@ -1054,7 +1054,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
"fmla v29.4s, v8.4s, v12.4s\n"
"fmla v30.4s, v7.4s, v12.4s\n"
- "add x20, x15, x4\n"
+ "add x20, x11, x4\n"
"fmla v31.4s, v6.4s, v12.4s\n"
"tbz %x[n_channels], #1, 63f\n"
"ldr d10, [x20], #0x8\n"
@@ -1066,7 +1066,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
"fmla v16.4s, v4.4s, v10.4s\n"
"fmla v17.4s, v3.4s, v10.4s\n"
- "add x20, x15, x28\n"
+ "add x20, x11, x13\n"
"fmla v20.4s, v1.4s, v10.4s\n"
"fmla v21.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 65f\n"
@@ -1079,7 +1079,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
"fmla v18.4s, v5.4s, v11.4s\n"
"fmla v19.4s, v4.4s, v11.4s\n"
- "add x20, x9, x4\n"
+ "add x20, x27, x4\n"
"fmla v22.4s, v2.4s, v11.4s\n"
"fmla v23.4s, v1.4s, v11.4s\n"
"tbz %x[n_channels], #1, 67f\n"
@@ -1092,7 +1092,7 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
"fmla v24.4s, v7.4s, v12.4s\n"
"fmla v25.4s, v6.4s, v12.4s\n"
- "add x20, x9, x28\n"
+ "add x20, x27, x13\n"
"fmla v28.4s, v4.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"tbz %x[n_channels], #1, 69f\n"
@@ -1105,24 +1105,24 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v14.4s\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v17.4s, v17.4s, v13.4s\n"
- "fmax v18.4s, v18.4s, v13.4s\n"
- "fmax v19.4s, v19.4s, v13.4s\n"
- "fmax v20.4s, v20.4s, v13.4s\n"
- "fmax v21.4s, v21.4s, v13.4s\n"
- "fmax v22.4s, v22.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v13.4s\n"
- "fmax v25.4s, v25.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v13.4s\n"
- "fmax v27.4s, v27.4s, v13.4s\n"
- "fmax v28.4s, v28.4s, v13.4s\n"
- "fmax v29.4s, v29.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v17.4s, v17.4s, v14.4s\n"
+ "fmax v18.4s, v18.4s, v14.4s\n"
+ "fmax v19.4s, v19.4s, v14.4s\n"
+ "fmax v20.4s, v20.4s, v14.4s\n"
+ "fmax v21.4s, v21.4s, v14.4s\n"
+ "fmax v22.4s, v22.4s, v14.4s\n"
+ "fmax v23.4s, v23.4s, v14.4s\n"
+ "fmax v24.4s, v24.4s, v14.4s\n"
+ "fmax v25.4s, v25.4s, v14.4s\n"
+ "fmax v26.4s, v26.4s, v14.4s\n"
+ "fmax v27.4s, v27.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v14.4s\n"
+ "fmax v30.4s, v30.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v14.4s\n"
"fmin v16.4s, v16.4s, v15.4s\n"
"fmin v17.4s, v17.4s, v15.4s\n"
"fmin v18.4s, v18.4s, v15.4s\n"
@@ -1140,18 +1140,18 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmin v30.4s, v30.4s, v15.4s\n"
"fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 71f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.d }[0], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v16.d }[0], [x23], x5\n"
"st1 { v20.d }[0], [x22], x5\n"
+ "add x26, x26, #0x8\n"
+ "add x24, x24, #0x8\n"
"st1 { v24.d }[0], [x21], x5\n"
- "add x8, x8, #0x8\n"
- "add x10, x10, #0x8\n"
"st1 { v28.d }[0], [x20], x5\n"
- "add x27, x27, #0x8\n"
- "add x24, x24, #0x8\n"
"st1 { v17.d }[0], [x23], x5\n"
"st1 { v21.d }[0], [x22], x5\n"
"st1 { v25.d }[0], [x21], x5\n"
@@ -1165,15 +1165,15 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v27.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 72f\n"
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.s }[2], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "st1 { v17.s }[2], [x23], x5\n"
"st1 { v20.s }[2], [x22], x5\n"
"st1 { v24.s }[2], [x21], x5\n"
"st1 { v28.s }[2], [x20], x5\n"
- "st1 { v17.s }[2], [x23], x5\n"
"st1 { v21.s }[2], [x22], x5\n"
"st1 { v25.s }[2], [x21], x5\n"
"st1 { v29.s }[2], [x20], x5\n"
@@ -1187,15 +1187,15 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v31.s }[2], [x20]\n"
"b 72f\n"
"71:" // Tile loop: Oddments: Store: Bit 1: Unset
- "mov x23, x8\n"
- "mov x22, x10\n"
- "st1 { v16.s }[0], [x23], x5\n"
- "mov x21, x27\n"
+ "mov x23, x17\n"
+ "mov x22, x28\n"
+ "mov x21, x26\n"
"mov x20, x24\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "st1 { v17.s }[0], [x23], x5\n"
"st1 { v20.s }[0], [x22], x5\n"
"st1 { v24.s }[0], [x21], x5\n"
"st1 { v28.s }[0], [x20], x5\n"
- "st1 { v17.s }[0], [x23], x5\n"
"st1 { v21.s }[0], [x22], x5\n"
"st1 { v25.s }[0], [x21], x5\n"
"st1 { v29.s }[0], [x20], x5\n"
@@ -1209,20 +1209,20 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"st1 { v31.s }[0], [x20]\n"
"72:" // Tile loop: Oddments: Store: Bit 1: End
"73:" // Tile loop: End
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x26, x26, #0x1\n"
- "add x21, x27, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x26, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x27, x27, x21, LT\n"
- "csel x26, x26, XZR, LT\n"
- "cmp x27, x20\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "add x20, x28, #0x1\n"
+ "cmp x27, x22\n"
+ "csel x28, x28, x20, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x28, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 76045f30d6..95ed57d48d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -102,9 +102,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"lsr x7, %x[n_channels], #0x2\n"
"ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x21]\n"
"ld1r { v14.4s }, [x20]\n"
"add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
"mov x15, #0x0\n"
@@ -122,583 +122,583 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr q7, [x17, #0x80]\n"
"ldr q8, [x17, #0x90]\n"
"add x17, x17, #0xa0\n"
- "ldp x21, x20, [x16, #0x0]\n"
- "ldr q9, [x21, x15]\n"
- "ldr q10, [x20, x15]\n"
+ "ldp x23, x22, [x16, #0x0]\n"
"ldp x21, x20, [x16, #0x10]\n"
+ "ldr q9, [x23, x15]\n"
+ "ldr q10, [x22, x15]\n"
"ldr q11, [x21, x15]\n"
"ldr q12, [x20, x15]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v23.16b, v30.16b\n fmla v23.4s, v4.4s, v9.4s\n"
- "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v8.4s, v9.4s\n"
"ldr x27, [x16, #0x20]\n"
"ldr x24, [x16, #0x30]\n"
- "mov v25.16b, v30.16b\n fmla v25.4s, v3.4s, v9.4s\n"
- "mov v28.16b, v30.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v3.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v1.4s, v9.4s\n"
"ldr x23, [x16, #0x28]\n"
"ldr x22, [x16, #0x38]\n"
- "mov v20.16b, v30.16b\n fmla v20.4s, v0.4s, v9.4s\n"
- "mov v16.16b, v30.16b\n fmla v16.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v0.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v7.4s, v9.4s\n"
"ldr x26, [x16, #0x40]\n"
"ldr x20, [x16, #0x48]\n"
- "mov v15.16b, v30.16b\n fmla v15.4s, v6.4s, v9.4s\n"
- "fmla v23.4s, v5.4s, v12.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v6.4s, v9.4s\n"
+ "mov v13.16b, v30.16b\n fmla v13.4s, v5.4s, v9.4s\n"
"ldr x25, [x16, #0x50]\n"
"ldr x21, [x16, #0x58]\n"
- "mov v27.16b, v30.16b\n fmla v27.4s, v5.4s, v9.4s\n"
- "mov v31.16b, v30.16b\n fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v2.4s, v9.4s\n"
"ldr q9, [x24, x15]\n"
"ldr x13, [x16, #0x70]\n"
- "fmla v17.4s, v0.4s, v10.4s\n"
- "ldr q22, [x27, x15]\n"
- "mov v10.16b, v30.16b\n fmla v10.4s, v2.4s, v11.4s\n"
- "ldr q18, [x23, x15]\n"
- "fmla v25.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v2.4s, v11.4s\n"
+ "ldr q29, [x23, x15]\n"
+ "fmla v27.4s, v4.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
"ldr x24, [x16, #0x60]\n"
"ldr x23, [x16, #0x68]\n"
- "fmla v20.4s, v1.4s, v12.4s\n"
- "fmla v16.4s, v8.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
"ldr x12, [x8, #0x0]\n"
"ldr x11, [x8, #0x8]\n"
- "fmla v15.4s, v7.4s, v12.4s\n"
- "mov v29.16b, v30.16b\n fmla v29.4s, v6.4s, v22.4s\n"
- "ldr q22, [x20, x15]\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v6.4s, v17.4s\n"
+ "ldr q10, [x20, x15]\n"
"ldr x28, [x16, #0x88]\n"
- "fmla v23.4s, v7.4s, v9.4s\n"
- "fmla v10.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
"ldr x10, [x8, #0x10]\n"
"ldr x9, [x8, #0x18]\n"
- "mov v21.16b, v30.16b\n fmla v21.4s, v3.4s, v12.4s\n"
+ "mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v12.4s\n"
"mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v12.4s\n"
- "ldr q11, [x22, x15]\n"
+ "ldr q12, [x22, x15]\n"
"ldr x22, [x16, #0x78]\n"
- "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v18.4s\n"
- "ldr q12, [x26, x15]\n"
- "fmla v25.4s, v6.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v29.4s\n"
+ "ldr q11, [x26, x15]\n"
+ "fmla v27.4s, v6.4s, v9.4s\n"
"ldr x20, [x16, #0x80]\n"
- "fmla v28.4s, v4.4s, v9.4s\n"
- "fmla v20.4s, v3.4s, v9.4s\n"
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
"add x14, x14, #0x10\n"
- "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v9.4s\n"
- "mov v18.16b, v30.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v0.4s, v9.4s\n"
"ldr q30, [x17, #0x0]\n"
- "fmla v27.4s, v8.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v9.4s\n"
- "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v13.4s, v8.4s, v9.4s\n"
+ "fmla v16.4s, v5.4s, v9.4s\n"
+ "fmla v24.4s, v2.4s, v9.4s\n"
"ldr q9, [x25, x15]\n"
- "fmla v17.4s, v1.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
"ldr x27, [x16, #0x90]\n"
- "fmla v16.4s, v0.4s, v11.4s\n"
- "ldr q11, [x21, x15]\n"
- "fmla v15.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x21, x15]\n"
+ "fmla v20.4s, v2.4s, v11.4s\n"
"ldr x21, [x16, #0x98]\n"
- "fmla v23.4s, v8.4s, v22.4s\n"
- "fmla v10.4s, v1.4s, v12.4s\n"
- "ldr q12, [x24, x15]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x24, x15]\n"
"ldr x26, [x16, #0xa0]\n"
- "fmla v25.4s, v7.4s, v22.4s\n"
- "fmla v21.4s, v6.4s, v22.4s\n"
- "fmla v28.4s, v5.4s, v22.4s\n"
- "fmla v20.4s, v4.4s, v22.4s\n"
- "fmla v19.4s, v3.4s, v22.4s\n"
- "fmla v26.4s, v2.4s, v22.4s\n"
- "fmla v18.4s, v1.4s, v22.4s\n"
- "fmla v24.4s, v0.4s, v22.4s\n"
- "ldr q22, [x23, x15]\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v22.4s, v6.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v10.4s\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x23, x15]\n"
"ldr x25, [x16, #0xa8]\n"
- "fmla v17.4s, v3.4s, v9.4s\n"
- "fmla v27.4s, v0.4s, v9.4s\n"
- "fmla v31.4s, v6.4s, v12.4s\n"
- "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v13.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v6.4s, v11.4s\n"
+ "fmla v24.4s, v3.4s, v11.4s\n"
"ldr q9, [x13, x15]\n"
"ldr x24, [x16, #0xb0]\n"
- "fmla v16.4s, v4.4s, v22.4s\n"
- "fmla v15.4s, v3.4s, v22.4s\n"
- "fmla v23.4s, v1.4s, v22.4s\n"
- "fmla v10.4s, v5.4s, v11.4s\n"
- "fmla v21.4s, v2.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
"ldr q12, [x22, x15]\n"
- "fmla v25.4s, v0.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v10.4s\n"
"ldr x23, [x16, #0xb8]\n"
"fmla v19.4s, v8.4s, v9.4s\n"
- "fmla v24.4s, v5.4s, v9.4s\n"
- "ldr q11, [x20, x15]\n"
+ "fmla v17.4s, v5.4s, v9.4s\n"
+ "ldr q9, [x20, x15]\n"
"ldr x22, [x16, #0xc0]\n"
- "fmla v17.4s, v5.4s, v22.4s\n"
- "fmla v27.4s, v2.4s, v22.4s\n"
- "ldr q22, [x28, x15]\n"
+ "fmla v26.4s, v5.4s, v10.4s\n"
+ "fmla v13.4s, v2.4s, v10.4s\n"
+ "ldr q11, [x28, x15]\n"
"ldr x20, [x16, #0xc8]\n"
- "fmla v16.4s, v5.4s, v12.4s\n"
- "fmla v15.4s, v4.4s, v12.4s\n"
- "fmla v23.4s, v2.4s, v12.4s\n"
- "fmla v10.4s, v3.4s, v12.4s\n"
- "fmla v25.4s, v1.4s, v12.4s\n"
- "fmla v21.4s, v0.4s, v12.4s\n"
- "ldr q9, [x21, x15]\n"
+ "fmla v18.4s, v5.4s, v12.4s\n"
+ "fmla v20.4s, v4.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v3.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "fmla v22.4s, v0.4s, v12.4s\n"
+ "ldr q10, [x21, x15]\n"
"ldr x28, [x16, #0xd8]\n"
- "fmla v29.4s, v7.4s, v11.4s\n"
- "fmla v26.4s, v6.4s, v11.4s\n"
- "ldr q12, [x27, x15]\n"
+ "fmla v24.4s, v7.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr q9, [x27, x15]\n"
"ldr x21, [x16, #0xd0]\n"
- "fmla v17.4s, v7.4s, v22.4s\n"
- "fmla v16.4s, v6.4s, v22.4s\n"
- "fmla v27.4s, v4.4s, v22.4s\n"
- "fmla v23.4s, v3.4s, v22.4s\n"
- "fmla v31.4s, v1.4s, v22.4s\n"
- "fmla v28.4s, v0.4s, v22.4s\n"
- "ldr q11, [x26, x15]\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "fmla v18.4s, v6.4s, v11.4s\n"
+ "fmla v13.4s, v4.4s, v11.4s\n"
+ "fmla v21.4s, v3.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "ldr q12, [x26, x15]\n"
"ldr x27, [x16, #0xe0]\n"
- "fmla v15.4s, v8.4s, v9.4s\n"
- "fmla v18.4s, v8.4s, v12.4s\n"
- "fmla v24.4s, v7.4s, v12.4s\n"
- "ldr q12, [x25, x15]\n"
- "fmla v19.4s, v1.4s, v9.4s\n"
+ "fmla v20.4s, v8.4s, v10.4s\n"
+ "fmla v23.4s, v8.4s, v9.4s\n"
+ "fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v10.4s\n"
"ldr x26, [x16, #0xe8]\n"
- "fmla v10.4s, v7.4s, v9.4s\n"
- "fmla v25.4s, v5.4s, v9.4s\n"
- "fmla v21.4s, v4.4s, v9.4s\n"
- "fmla v20.4s, v2.4s, v9.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v22.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
"ldr q9, [x24, x15]\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla v17.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v1.4s, v11.4s\n"
- "fmla v15.4s, v0.4s, v11.4s\n"
- "ldr q22, [x23, x15]\n"
- "fmla v27.4s, v7.4s, v12.4s\n"
- "ldr x25, [x16, #0xf8]\n"
- "fmla v23.4s, v6.4s, v12.4s\n"
- "fmla v31.4s, v4.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v12.4s\n"
- "fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v26.4s, v0.4s, v12.4s\n"
- "ldr q11, [x22, x15]\n"
- "fmla v19.4s, v4.4s, v11.4s\n"
- "ldr x23, [x16, #0x100]\n"
- "fmla v18.4s, v2.4s, v11.4s\n"
- "fmla v16.4s, v2.4s, v9.4s\n"
- "fmla v15.4s, v1.4s, v9.4s\n"
- "fmla v10.4s, v0.4s, v9.4s\n"
- "ldr q9, [x20, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v0.4s, v12.4s\n"
+ "ldr q12, [x23, x15]\n"
+ "fmla v13.4s, v7.4s, v11.4s\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "fmla v16.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "ldr q10, [x22, x15]\n"
+ "fmla v18.4s, v2.4s, v9.4s\n"
+ "ldr x22, [x16, #0x100]\n"
+ "fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q11, [x20, x15]\n"
"ldr x20, [x16, #0x108]\n"
- "fmla v17.4s, v6.4s, v22.4s\n"
- "fmla v27.4s, v3.4s, v22.4s\n"
- "fmla v31.4s, v0.4s, v22.4s\n"
- "ldr q22, [x21, x15]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v13.4s, v3.4s, v12.4s\n"
+ "fmla v19.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v2.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v12.4s\n"
+ "ldr q9, [x21, x15]\n"
+ "fmla v27.4s, v8.4s, v10.4s\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "ldr q10, [x28, x15]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
- "ldr x22, [x16, #0x110]\n"
- "fmla v21.4s, v7.4s, v11.4s\n"
- "fmla v20.4s, v5.4s, v11.4s\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q12, [x28, x15]\n"
- "fmla v19.4s, v2.4s, v9.4s\n"
"ldr x21, [x16, #0x118]\n"
- "fmla v29.4s, v0.4s, v22.4s\n"
- "fmla v26.4s, v4.4s, v12.4s\n"
- "fmla v18.4s, v3.4s, v12.4s\n"
- "fmla v10.4s, v8.4s, v9.4s\n"
- "fmla v21.4s, v5.4s, v9.4s\n"
- "ldr q11, [x27, x15]\n"
- "fmla v27.4s, v6.4s, v22.4s\n"
- "fmla v31.4s, v3.4s, v22.4s\n"
- "ldr q22, [x26, x15]\n"
- "fmla v28.4s, v7.4s, v12.4s\n"
- "fmla v20.4s, v6.4s, v12.4s\n"
- "fmla v29.4s, v5.4s, v12.4s\n"
- "fmla v19.4s, v5.4s, v11.4s\n"
- "fmla v24.4s, v2.4s, v11.4s\n"
- "fmla v26.4s, v7.4s, v22.4s\n"
- "fmla v18.4s, v6.4s, v22.4s\n"
- "fmla v31.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v9.4s\n"
+ "fmla v13.4s, v6.4s, v9.4s\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "ldr q9, [x26, x15]\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v23.4s, v3.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v11.4s\n"
+ "ldr q12, [x27, x15]\n"
+ "fmla v31.4s, v7.4s, v10.4s\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v5.4s, v10.4s\n"
+ "fmla v16.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x25, x15]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v7.4s, v9.4s\n"
+ "fmla v23.4s, v6.4s, v9.4s\n"
+ "fmla v22.4s, v8.4s, v12.4s\n"
"ldr q12, [x24, x15]\n"
- "fmla v29.4s, v8.4s, v22.4s\n"
- "ldr q22, [x23, x15]\n"
- "fmla v28.4s, v8.4s, v12.4s\n"
- "fmla v20.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
- "fmla v26.4s, v5.4s, v12.4s\n"
- "fmla v18.4s, v4.4s, v12.4s\n"
- "fmla v24.4s, v3.4s, v12.4s\n"
- "ldr q12, [x20, x15]\n"
- "ldp x20, x24, [x16, #0x0]\n"
- "ldr q9, [x20, x6]\n"
- "fmla v21.4s, v8.4s, v11.4s\n"
- "ldr q11, [x25, x15]\n"
- "fmla v17.4s, v4.4s, v22.4s\n"
- "fmla v16.4s, v3.4s, v22.4s\n"
- "fmla v15.4s, v5.4s, v12.4s\n"
- "fmax v17.4s, v17.4s, v13.4s\n"
- "fmla v10.4s, v4.4s, v12.4s\n"
- "fmla v26.4s, v8.4s, v11.4s\n"
- "fmax v16.4s, v16.4s, v13.4s\n"
- "fmla v18.4s, v7.4s, v11.4s\n"
- "fmla v24.4s, v6.4s, v11.4s\n"
- "ldr q11, [x22, x15]\n"
- "fmax v15.4s, v15.4s, v13.4s\n"
- "fmla v27.4s, v1.4s, v22.4s\n"
- "fmla v23.4s, v0.4s, v22.4s\n"
- "ldr q22, [x21, x15]\n"
+ "fmla v24.4s, v8.4s, v9.4s\n"
+ "ldr q9, [x22, x15]\n"
+ "fmla v31.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v7.4s, v10.4s\n"
+ "fmla v19.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v23.4s, v4.4s, v10.4s\n"
+ "ldr q11, [x20, x15]\n"
+ "fmla v26.4s, v4.4s, v9.4s\n"
+ "ldp x20, x22, [x16, #0x0]\n"
+ "fmla v18.4s, v3.4s, v9.4s\n"
+ "fmla v13.4s, v1.4s, v9.4s\n"
+ "fmla v21.4s, v0.4s, v9.4s\n"
+ "ldr q10, [x21, x15]\n"
"ldr q0, [x17, #0x10]\n"
- "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v7.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v17.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x23, x15]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
"ldr q2, [x17, #0x30]\n"
- "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v11.4s\n"
"ldr q1, [x17, #0x20]\n"
- "fmax v10.4s, v10.4s, v13.4s\n"
- "fmla v31.4s, v7.4s, v11.4s\n"
- "fmla v28.4s, v6.4s, v11.4s\n"
- "ldr q6, [x17, #0x70]\n"
- "fmla v20.4s, v8.4s, v22.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
"ldr q8, [x17, #0x90]\n"
- "fmla v19.4s, v7.4s, v22.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v19.4s, v7.4s, v10.4s\n"
+ "fmla v16.4s, v7.4s, v12.4s\n"
"ldr q7, [x17, #0x80]\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v14.4s\n"
- "str q17, [x12, x14]\n"
- "ldr x23, [x8, #0x20]\n"
- "fmin v15.4s, v15.4s, v14.4s\n"
- "fmin v10.4s, v10.4s, v14.4s\n"
- "str q16, [x11, x14]\n"
- "ldr x22, [x8, #0x28]\n"
- "fmax v27.4s, v27.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v13.4s\n"
- "str q15, [x10, x14]\n"
- "ldr x21, [x8, #0x30]\n"
- "fmax v25.4s, v25.4s, v13.4s\n"
- "fmax v21.4s, v21.4s, v13.4s\n"
- "str q10, [x9, x14]\n"
- "ldr x20, [x8, #0x38]\n"
- "fmla v29.4s, v4.4s, v11.4s\n"
- "fmla v26.4s, v3.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v24.4s, v4.4s, v12.4s\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
"ldr q3, [x17, #0x40]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmla v18.4s, v5.4s, v22.4s\n"
+ "fmax v13.4s, v13.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmla v23.4s, v5.4s, v10.4s\n"
"ldr q5, [x17, #0x60]\n"
- "fmla v24.4s, v4.4s, v22.4s\n"
- "ldr q10, [x24, x6]\n"
+ "ldr q11, [x21, x6]\n"
+ "ldr q12, [x20, x6]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q26, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "ldr q10, [x22, x6]\n"
"ldr q4, [x17, #0x50]\n"
- "fmin v23.4s, v23.4s, v14.4s\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
- "str q27, [x23, x14]\n"
+ "str q18, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmin v13.4s, v13.4s, v14.4s\n"
+ "str q20, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
"fmin v21.4s, v21.4s, v14.4s\n"
- "fmax v31.4s, v31.4s, v13.4s\n"
- "str q23, [x22, x14]\n"
- "ldr x25, [x8, #0x40]\n"
- "fmax v28.4s, v28.4s, v13.4s\n"
- "fmax v20.4s, v20.4s, v13.4s\n"
- "str q25, [x21, x14]\n"
- "ldr x23, [x8, #0x48]\n"
- "fmax v19.4s, v19.4s, v13.4s\n"
- "str q21, [x20, x14]\n"
- "ldr x22, [x8, #0x50]\n"
- "ldr x24, [x8, #0x58]\n"
- "ldp x21, x20, [x16, #0x10]\n"
- "ldr q11, [x21, x6]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q25, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "str q13, [x23, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "str q21, [x22, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "str q27, [x21, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "str q22, [x20, x14]\n"
+ "ldr x20, [x8, #0x58]\n"
"fmin v31.4s, v31.4s, v14.4s\n"
"fmin v28.4s, v28.4s, v14.4s\n"
- "ldr q12, [x20, x6]\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
"fmin v19.4s, v19.4s, v14.4s\n"
- "str q31, [x25, x14]\n"
- "fmax v29.4s, v29.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v13.4s\n"
- "str q28, [x23, x14]\n"
- "ldr x23, [x8, #0x60]\n"
- "fmax v18.4s, v18.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v13.4s\n"
- "str q20, [x22, x14]\n"
- "ldr x22, [x8, #0x68]\n"
- "str q19, [x24, x14]\n"
- "ldr x21, [x8, #0x70]\n"
- "ldr x20, [x8, #0x78]\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
"add x6, x6, #0x10\n"
- "cmp x6, x7, LSL #4\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
"add x15, x15, #0x10\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "str q16, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "cmp x6, x7, LSL #4\n"
+ "str q31, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
"fmin v24.4s, v24.4s, v14.4s\n"
- "str q29, [x23, x14]\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q28, [x21, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
"add x17, x17, #0xa0\n"
- "str q26, [x22, x14]\n"
- "str q18, [x21, x14]\n"
- "str q24, [x20, x14]\n"
+ "str q19, [x20, x14]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "str q24, [x23, x14]\n"
+ "str q29, [x22, x14]\n"
+ "str q23, [x21, x14]\n"
+ "str q17, [x20, x14]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v31.16b, v30.16b\n fmla v31.4s, v4.4s, v9.4s\n"
- "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v4.4s, v9.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v8.4s, v9.4s\n"
"ldr x27, [x16, #0x20]\n"
"ldr x24, [x16, #0x30]\n"
- "mov v15.16b, v30.16b\n fmla v15.4s, v3.4s, v9.4s\n"
- "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v13.16b, v30.16b\n fmla v13.4s, v3.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v1.4s, v9.4s\n"
"ldr x23, [x16, #0x28]\n"
"ldr x22, [x16, #0x38]\n"
- "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v9.4s\n"
- "mov v20.16b, v30.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v0.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v7.4s, v9.4s\n"
"ldr x26, [x16, #0x40]\n"
"ldr x21, [x16, #0x48]\n"
- "mov v21.16b, v30.16b\n fmla v21.4s, v6.4s, v9.4s\n"
- "fmla v31.4s, v5.4s, v12.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v5.4s, v9.4s\n"
"ldr x25, [x16, #0x50]\n"
"ldr x20, [x16, #0x58]\n"
- "mov v18.16b, v30.16b\n fmla v18.4s, v5.4s, v9.4s\n"
- "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v9.4s\n"
- "ldr q24, [x24, x15]\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q22, [x24, x15]\n"
"ldr x13, [x16, #0x70]\n"
- "fmla v17.4s, v0.4s, v10.4s\n"
- "ldr q22, [x27, x15]\n"
- "mov v28.16b, v30.16b\n fmla v28.4s, v2.4s, v11.4s\n"
- "ldr q16, [x23, x15]\n"
- "fmla v15.4s, v4.4s, v12.4s\n"
- "fmla v29.4s, v2.4s, v12.4s\n"
+ "fmla v19.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x27, x15]\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q23, [x23, x15]\n"
+ "fmla v13.4s, v4.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
"ldr x24, [x16, #0x60]\n"
"ldr x23, [x16, #0x68]\n"
- "fmla v19.4s, v1.4s, v12.4s\n"
- "fmla v20.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v1.4s, v12.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
"ldr x12, [x8, #0x0]\n"
"ldr x11, [x8, #0x8]\n"
- "fmla v21.4s, v7.4s, v12.4s\n"
- "mov v10.16b, v30.16b\n fmla v10.4s, v6.4s, v22.4s\n"
- "ldr q22, [x21, x15]\n"
+ "fmla v25.4s, v7.4s, v12.4s\n"
+ "mov v11.16b, v30.16b\n fmla v11.4s, v6.4s, v20.4s\n"
+ "ldr q9, [x21, x15]\n"
"ldr x28, [x16, #0x88]\n"
- "fmla v31.4s, v7.4s, v24.4s\n"
- "fmla v28.4s, v6.4s, v12.4s\n"
+ "fmla v16.4s, v7.4s, v22.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
"ldr x10, [x8, #0x10]\n"
"ldr x9, [x8, #0x18]\n"
- "mov v9.16b, v30.16b\n fmla v9.4s, v3.4s, v12.4s\n"
- "mov v11.16b, v30.16b\n fmla v11.4s, v0.4s, v12.4s\n"
- "ldr q23, [x22, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v3.4s, v12.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v12.4s\n"
+ "ldr q21, [x22, x15]\n"
"ldr x22, [x16, #0x78]\n"
- "mov v12.16b, v30.16b\n fmla v12.4s, v8.4s, v16.4s\n"
- "ldr q16, [x26, x15]\n"
- "fmla v15.4s, v6.4s, v24.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v23.4s\n"
+ "ldr q23, [x26, x15]\n"
+ "fmla v13.4s, v6.4s, v22.4s\n"
"ldr x21, [x16, #0x80]\n"
- "fmla v29.4s, v4.4s, v24.4s\n"
- "fmla v19.4s, v3.4s, v24.4s\n"
+ "fmla v31.4s, v4.4s, v22.4s\n"
+ "fmla v17.4s, v3.4s, v22.4s\n"
"add x14, x14, #0x10\n"
- "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v24.4s\n"
- "mov v25.16b, v30.16b\n fmla v25.4s, v0.4s, v24.4s\n"
- "fmla v18.4s, v8.4s, v24.4s\n"
- "fmla v27.4s, v5.4s, v24.4s\n"
- "fmla v10.4s, v2.4s, v24.4s\n"
- "ldr q24, [x25, x15]\n"
- "fmla v17.4s, v1.4s, v23.4s\n"
+ "mov v12.16b, v30.16b\n fmla v12.4s, v1.4s, v22.4s\n"
+ "fmla v30.4s, v0.4s, v22.4s\n"
+ "fmla v28.4s, v8.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v21.4s\n"
"ldr x27, [x16, #0x90]\n"
- "fmla v20.4s, v0.4s, v23.4s\n"
- "ldr q23, [x20, x15]\n"
- "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v18.4s, v0.4s, v21.4s\n"
+ "ldr q21, [x20, x15]\n"
+ "fmla v25.4s, v2.4s, v23.4s\n"
"ldr x20, [x16, #0x98]\n"
- "fmla v31.4s, v8.4s, v22.4s\n"
- "fmla v28.4s, v1.4s, v16.4s\n"
- "ldr q16, [x24, x15]\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v27.4s, v1.4s, v23.4s\n"
+ "ldr q20, [x24, x15]\n"
"ldr x26, [x16, #0xa0]\n"
- "fmla v15.4s, v7.4s, v22.4s\n"
- "fmla v9.4s, v6.4s, v22.4s\n"
- "fmla v29.4s, v5.4s, v22.4s\n"
- "fmla v19.4s, v4.4s, v22.4s\n"
- "fmla v11.4s, v3.4s, v22.4s\n"
- "fmla v26.4s, v2.4s, v22.4s\n"
- "fmla v25.4s, v1.4s, v22.4s\n"
- "fmla v12.4s, v0.4s, v22.4s\n"
- "ldr q22, [x23, x15]\n"
+ "fmla v13.4s, v7.4s, v9.4s\n"
+ "fmla v10.4s, v6.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v9.4s\n"
+ "fmla v17.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v12.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v24.4s, v0.4s, v9.4s\n"
+ "ldr q23, [x23, x15]\n"
"ldr x25, [x16, #0xa8]\n"
- "fmla v17.4s, v3.4s, v24.4s\n"
- "fmla v18.4s, v0.4s, v24.4s\n"
- "fmla v27.4s, v6.4s, v16.4s\n"
- "fmla v10.4s, v3.4s, v16.4s\n"
- "ldr q16, [x13, x15]\n"
+ "fmla v19.4s, v3.4s, v22.4s\n"
+ "fmla v28.4s, v0.4s, v22.4s\n"
+ "fmla v29.4s, v6.4s, v20.4s\n"
+ "fmla v11.4s, v3.4s, v20.4s\n"
+ "ldr q20, [x13, x15]\n"
"ldr x24, [x16, #0xb0]\n"
- "fmla v20.4s, v4.4s, v22.4s\n"
- "fmla v21.4s, v3.4s, v22.4s\n"
- "fmla v31.4s, v1.4s, v22.4s\n"
- "fmla v28.4s, v5.4s, v23.4s\n"
- "fmla v9.4s, v2.4s, v23.4s\n"
- "ldr q23, [x22, x15]\n"
- "fmla v15.4s, v0.4s, v22.4s\n"
+ "fmla v18.4s, v4.4s, v23.4s\n"
+ "fmla v25.4s, v3.4s, v23.4s\n"
+ "fmla v16.4s, v1.4s, v23.4s\n"
+ "fmla v27.4s, v5.4s, v21.4s\n"
+ "fmla v10.4s, v2.4s, v21.4s\n"
+ "ldr q22, [x22, x15]\n"
+ "fmla v13.4s, v0.4s, v23.4s\n"
"ldr x23, [x16, #0xb8]\n"
- "fmla v11.4s, v8.4s, v16.4s\n"
- "fmla v12.4s, v5.4s, v16.4s\n"
- "ldr q16, [x21, x15]\n"
+ "fmla v26.4s, v8.4s, v20.4s\n"
+ "fmla v24.4s, v5.4s, v20.4s\n"
+ "ldr q21, [x21, x15]\n"
"ldr x22, [x16, #0xc0]\n"
- "fmla v17.4s, v5.4s, v22.4s\n"
- "fmla v18.4s, v2.4s, v22.4s\n"
- "ldr q22, [x28, x15]\n"
+ "fmla v19.4s, v5.4s, v23.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q20, [x28, x15]\n"
"ldr x21, [x16, #0xc8]\n"
- "fmla v20.4s, v5.4s, v23.4s\n"
- "fmla v21.4s, v4.4s, v23.4s\n"
- "fmla v31.4s, v2.4s, v23.4s\n"
- "fmla v28.4s, v3.4s, v23.4s\n"
- "fmla v15.4s, v1.4s, v23.4s\n"
- "fmla v9.4s, v0.4s, v23.4s\n"
- "ldr q23, [x20, x15]\n"
+ "fmla v18.4s, v5.4s, v22.4s\n"
+ "fmla v25.4s, v4.4s, v22.4s\n"
+ "fmla v16.4s, v2.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v13.4s, v1.4s, v22.4s\n"
+ "fmla v10.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
"ldr x28, [x16, #0xd8]\n"
- "fmla v10.4s, v7.4s, v16.4s\n"
- "fmla v26.4s, v6.4s, v16.4s\n"
- "ldr q16, [x27, x15]\n"
+ "fmla v11.4s, v7.4s, v21.4s\n"
+ "fmla v12.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x27, x15]\n"
"ldr x20, [x16, #0xd0]\n"
- "fmla v17.4s, v7.4s, v22.4s\n"
- "fmla v20.4s, v6.4s, v22.4s\n"
- "fmla v18.4s, v4.4s, v22.4s\n"
- "fmla v31.4s, v3.4s, v22.4s\n"
- "fmla v27.4s, v1.4s, v22.4s\n"
- "fmla v29.4s, v0.4s, v22.4s\n"
- "ldr q22, [x26, x15]\n"
+ "fmla v19.4s, v7.4s, v20.4s\n"
+ "fmla v18.4s, v6.4s, v20.4s\n"
+ "fmla v28.4s, v4.4s, v20.4s\n"
+ "fmla v16.4s, v3.4s, v20.4s\n"
+ "fmla v29.4s, v1.4s, v20.4s\n"
+ "fmla v31.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x26, x15]\n"
"ldr x27, [x16, #0xe0]\n"
- "fmla v21.4s, v8.4s, v23.4s\n"
- "fmla v25.4s, v8.4s, v16.4s\n"
- "fmla v12.4s, v7.4s, v16.4s\n"
- "ldr q16, [x25, x15]\n"
- "fmla v11.4s, v1.4s, v23.4s\n"
+ "fmla v25.4s, v8.4s, v22.4s\n"
+ "fmla v30.4s, v8.4s, v21.4s\n"
+ "fmla v24.4s, v7.4s, v21.4s\n"
+ "ldr q21, [x25, x15]\n"
+ "fmla v26.4s, v1.4s, v22.4s\n"
"ldr x26, [x16, #0xe8]\n"
- "fmla v28.4s, v7.4s, v23.4s\n"
- "fmla v15.4s, v5.4s, v23.4s\n"
- "fmla v9.4s, v4.4s, v23.4s\n"
- "fmla v19.4s, v2.4s, v23.4s\n"
- "ldr q23, [x24, x15]\n"
- "ldr x25, [x16, #0xf0]\n"
+ "fmla v27.4s, v7.4s, v22.4s\n"
+ "fmla v13.4s, v5.4s, v22.4s\n"
+ "fmla v10.4s, v4.4s, v22.4s\n"
"fmla v17.4s, v2.4s, v22.4s\n"
- "fmla v20.4s, v1.4s, v22.4s\n"
- "fmla v21.4s, v0.4s, v22.4s\n"
- "ldr q22, [x23, x15]\n"
- "fmla v18.4s, v7.4s, v16.4s\n"
+ "ldr q22, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v19.4s, v2.4s, v20.4s\n"
+ "fmla v18.4s, v1.4s, v20.4s\n"
+ "fmla v25.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x23, x15]\n"
+ "fmla v28.4s, v7.4s, v21.4s\n"
"ldr x24, [x16, #0xf8]\n"
- "fmla v31.4s, v6.4s, v16.4s\n"
- "fmla v27.4s, v4.4s, v16.4s\n"
- "fmla v29.4s, v3.4s, v16.4s\n"
- "fmla v10.4s, v1.4s, v16.4s\n"
- "fmla v26.4s, v0.4s, v16.4s\n"
- "ldr q16, [x22, x15]\n"
- "fmla v11.4s, v4.4s, v16.4s\n"
+ "fmla v16.4s, v6.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v31.4s, v3.4s, v21.4s\n"
+ "fmla v11.4s, v1.4s, v21.4s\n"
+ "fmla v12.4s, v0.4s, v21.4s\n"
+ "ldr q21, [x22, x15]\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
"ldr x23, [x16, #0x100]\n"
- "fmla v25.4s, v2.4s, v16.4s\n"
- "fmla v20.4s, v2.4s, v23.4s\n"
- "fmla v21.4s, v1.4s, v23.4s\n"
- "fmla v28.4s, v0.4s, v23.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
"ldr q23, [x21, x15]\n"
"ldr x22, [x16, #0x108]\n"
- "fmla v17.4s, v6.4s, v22.4s\n"
- "fmla v18.4s, v3.4s, v22.4s\n"
- "fmla v27.4s, v0.4s, v22.4s\n"
- "ldr q22, [x20, x15]\n"
- "fmla v15.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v6.4s, v20.4s\n"
+ "fmla v28.4s, v3.4s, v20.4s\n"
+ "fmla v26.4s, v4.4s, v21.4s\n"
+ "fmla v30.4s, v2.4s, v21.4s\n"
+ "fmla v29.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x20, x15]\n"
+ "fmla v13.4s, v8.4s, v21.4s\n"
"ldr x21, [x16, #0x110]\n"
- "fmla v9.4s, v7.4s, v16.4s\n"
- "fmla v19.4s, v5.4s, v16.4s\n"
- "fmla v12.4s, v1.4s, v16.4s\n"
- "ldr q16, [x28, x15]\n"
- "fmla v11.4s, v2.4s, v23.4s\n"
+ "fmla v10.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v5.4s, v21.4s\n"
+ "fmla v24.4s, v1.4s, v21.4s\n"
+ "ldr q21, [x28, x15]\n"
+ "fmla v27.4s, v8.4s, v23.4s\n"
"ldr x20, [x16, #0x118]\n"
- "fmla v10.4s, v0.4s, v22.4s\n"
- "fmla v26.4s, v4.4s, v16.4s\n"
- "fmla v25.4s, v3.4s, v16.4s\n"
- "fmla v28.4s, v8.4s, v23.4s\n"
- "fmla v9.4s, v5.4s, v23.4s\n"
- "ldr q23, [x27, x15]\n"
- "fmla v18.4s, v6.4s, v22.4s\n"
- "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v23.4s\n"
+ "fmla v11.4s, v0.4s, v20.4s\n"
+ "fmla v28.4s, v6.4s, v20.4s\n"
+ "fmla v29.4s, v3.4s, v20.4s\n"
"ldr q22, [x26, x15]\n"
- "fmla v29.4s, v7.4s, v16.4s\n"
- "fmla v19.4s, v6.4s, v16.4s\n"
- "fmla v10.4s, v5.4s, v16.4s\n"
- "fmla v11.4s, v5.4s, v23.4s\n"
- "fmla v12.4s, v2.4s, v23.4s\n"
- "fmla v26.4s, v7.4s, v22.4s\n"
- "fmla v25.4s, v6.4s, v22.4s\n"
- "fmla v27.4s, v8.4s, v16.4s\n"
- "ldr q16, [x25, x15]\n"
- "fmla v10.4s, v8.4s, v22.4s\n"
- "ldr q30, [x23, x15]\n"
- "fmla v29.4s, v8.4s, v16.4s\n"
- "fmla v19.4s, v7.4s, v16.4s\n"
- "fmla v11.4s, v6.4s, v16.4s\n"
- "fmla v26.4s, v5.4s, v16.4s\n"
- "fmla v25.4s, v4.4s, v16.4s\n"
- "fmla v12.4s, v3.4s, v16.4s\n"
- "ldr q24, [x22, x15]\n"
- "fmla v9.4s, v8.4s, v23.4s\n"
- "ldr q16, [x24, x15]\n"
- "fmla v17.4s, v4.4s, v30.4s\n"
- "fmax v17.4s, v17.4s, v13.4s\n"
- "fmla v20.4s, v3.4s, v30.4s\n"
- "fmla v21.4s, v5.4s, v24.4s\n"
- "fmax v20.4s, v20.4s, v13.4s\n"
- "fmla v28.4s, v4.4s, v24.4s\n"
- "fmla v26.4s, v8.4s, v16.4s\n"
- "fmax v21.4s, v21.4s, v13.4s\n"
- "fmla v25.4s, v7.4s, v16.4s\n"
- "fmla v12.4s, v6.4s, v16.4s\n"
- "ldr q23, [x21, x15]\n"
- "fmax v28.4s, v28.4s, v13.4s\n"
- "fmla v18.4s, v1.4s, v30.4s\n"
- "fmla v31.4s, v0.4s, v30.4s\n"
- "ldr q16, [x20, x15]\n"
- "fmin v17.4s, v17.4s, v14.4s\n"
- "fmla v15.4s, v2.4s, v24.4s\n"
- "fmla v9.4s, v1.4s, v24.4s\n"
- "fmin v20.4s, v20.4s, v14.4s\n"
- "str q17, [x12, x14]\n"
- "fmla v27.4s, v7.4s, v23.4s\n"
- "fmla v29.4s, v6.4s, v23.4s\n"
- "fmin v21.4s, v21.4s, v14.4s\n"
- "str q20, [x11, x14]\n"
- "fmla v19.4s, v8.4s, v16.4s\n"
- "fmla v11.4s, v7.4s, v16.4s\n"
- "fmin v28.4s, v28.4s, v14.4s\n"
- "str q21, [x10, x14]\n"
- "fmax v18.4s, v18.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v13.4s\n"
- "str q28, [x9, x14]\n"
+ "fmla v12.4s, v4.4s, v21.4s\n"
+ "fmla v30.4s, v3.4s, v21.4s\n"
+ "fmla v10.4s, v5.4s, v23.4s\n"
+ "ldr q20, [x27, x15]\n"
+ "fmla v31.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v6.4s, v21.4s\n"
+ "fmla v11.4s, v5.4s, v21.4s\n"
+ "fmla v29.4s, v8.4s, v21.4s\n"
+ "ldr q21, [x25, x15]\n"
+ "fmla v26.4s, v5.4s, v20.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v12.4s, v7.4s, v22.4s\n"
+ "fmla v30.4s, v6.4s, v22.4s\n"
+ "fmla v10.4s, v8.4s, v20.4s\n"
+ "ldr q20, [x24, x15]\n"
+ "fmla v11.4s, v8.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v31.4s, v8.4s, v21.4s\n"
+ "fmla v17.4s, v7.4s, v21.4s\n"
+ "fmla v26.4s, v6.4s, v21.4s\n"
+ "fmla v12.4s, v5.4s, v21.4s\n"
+ "fmla v24.4s, v3.4s, v21.4s\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "ldr q21, [x22, x15]\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v28.4s, v1.4s, v22.4s\n"
+ "fmla v16.4s, v0.4s, v22.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v25.4s, v5.4s, v21.4s\n"
+ "fmla v27.4s, v4.4s, v21.4s\n"
+ "fmla v12.4s, v8.4s, v20.4s\n"
+ "fmla v30.4s, v7.4s, v20.4s\n"
+ "fmla v24.4s, v6.4s, v20.4s\n"
+ "ldr q0, [x21, x15]\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmla v13.4s, v2.4s, v21.4s\n"
+ "fmla v10.4s, v1.4s, v21.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v17.4s, v8.4s, v23.4s\n"
+ "fmla v26.4s, v7.4s, v23.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmla v29.4s, v7.4s, v0.4s\n"
+ "fmla v31.4s, v6.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v11.4s, v4.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v12.4s, v3.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
+ "fmla v30.4s, v5.4s, v23.4s\n"
+ "fmax v13.4s, v13.4s, v15.4s\n"
+ "fmax v10.4s, v10.4s, v15.4s\n"
+ "str q19, [x12, x14]\n"
"ldr x23, [x8, #0x20]\n"
- "fmax v15.4s, v15.4s, v13.4s\n"
- "fmax v9.4s, v9.4s, v13.4s\n"
+ "str q18, [x11, x14]\n"
"ldr x22, [x8, #0x28]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "str q25, [x10, x14]\n"
"ldr x21, [x8, #0x30]\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "str q27, [x9, x14]\n"
"ldr x20, [x8, #0x38]\n"
- "fmla v10.4s, v4.4s, v23.4s\n"
- "fmla v26.4s, v3.4s, v23.4s\n"
- "fmin v18.4s, v18.4s, v14.4s\n"
- "fmla v25.4s, v5.4s, v16.4s\n"
- "fmla v12.4s, v4.4s, v16.4s\n"
- "fmin v31.4s, v31.4s, v14.4s\n"
- "str q18, [x23, x14]\n"
- "fmin v15.4s, v15.4s, v14.4s\n"
- "fmin v9.4s, v9.4s, v14.4s\n"
- "str q31, [x22, x14]\n"
+ "fmin v13.4s, v13.4s, v14.4s\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "str q28, [x23, x14]\n"
"ldr x23, [x8, #0x40]\n"
- "fmax v27.4s, v27.4s, v13.4s\n"
- "fmax v29.4s, v29.4s, v13.4s\n"
- "str q15, [x21, x14]\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "str q16, [x22, x14]\n"
"ldr x22, [x8, #0x48]\n"
- "fmax v19.4s, v19.4s, v13.4s\n"
- "fmax v11.4s, v11.4s, v13.4s\n"
- "str q9, [x20, x14]\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q13, [x21, x14]\n"
"ldr x21, [x8, #0x50]\n"
+ "fmax v11.4s, v11.4s, v15.4s\n"
+ "fmax v12.4s, v12.4s, v15.4s\n"
+ "str q10, [x20, x14]\n"
"ldr x20, [x8, #0x58]\n"
- "fmin v27.4s, v27.4s, v14.4s\n"
- "fmin v29.4s, v29.4s, v14.4s\n"
- "str q27, [x23, x14]\n"
- "fmin v19.4s, v19.4s, v14.4s\n"
- "fmin v11.4s, v11.4s, v14.4s\n"
- "str q29, [x22, x14]\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "str q29, [x23, x14]\n"
"ldr x23, [x8, #0x60]\n"
- "fmax v10.4s, v10.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v13.4s\n"
- "str q19, [x21, x14]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v14.4s\n"
+ "str q31, [x22, x14]\n"
"ldr x22, [x8, #0x68]\n"
- "fmax v25.4s, v25.4s, v13.4s\n"
- "fmax v12.4s, v12.4s, v13.4s\n"
- "str q11, [x20, x14]\n"
+ "str q17, [x21, x14]\n"
"ldr x21, [x8, #0x70]\n"
- "ldr x20, [x8, #0x78]\n"
- "fmin v10.4s, v10.4s, v14.4s\n"
- "fmin v26.4s, v26.4s, v14.4s\n"
- "str q10, [x23, x14]\n"
- "fmin v25.4s, v25.4s, v14.4s\n"
"fmin v12.4s, v12.4s, v14.4s\n"
- "str q26, [x22, x14]\n"
- "add x15, x15, #0x10\n"
- "str q25, [x21, x14]\n"
- "str q12, [x20, x14]\n"
+ "str q26, [x20, x14]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q11, [x23, x14]\n"
+ "str q12, [x22, x14]\n"
+ "str q30, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 72f\n"
@@ -715,10 +715,10 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr q8, [x17, #0x90]\n"
"ldr x23, [x16, #0x0]\n"
"ldr x22, [x16, #0x8]\n"
- "add x23, x23, x15\n"
- "add x22, x22, x15\n"
"ldr x21, [x16, #0x10]\n"
"ldr x20, [x16, #0x18]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
"add x21, x21, x15\n"
"add x20, x20, x15\n"
"tbz %x[n_channels], #1, 4f\n"
@@ -741,20 +741,20 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"mov v16.16b, v30.16b\n fmla v16.4s, v8.4s, v9.4s\n"
"mov v17.16b, v30.16b\n fmla v17.4s, v7.4s, v9.4s\n"
"ldr x20, [x16, #0x20]\n"
- "add x20, x20, x15\n"
"mov v18.16b, v30.16b\n fmla v18.4s, v6.4s, v9.4s\n"
"mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
"mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v9.4s\n"
"mov v25.16b, v30.16b\n fmla v25.4s, v1.4s, v9.4s\n"
"mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v9.4s\n"
"mov v19.16b, v30.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"mov v20.16b, v30.16b\n fmla v20.4s, v5.4s, v9.4s\n"
"mov v24.16b, v30.16b\n fmla v24.4s, v2.4s, v9.4s\n"
"fmla v16.4s, v0.4s, v10.4s\n"
"fmla v17.4s, v8.4s, v12.4s\n"
"fmla v18.4s, v7.4s, v12.4s\n"
- "fmla v19.4s, v6.4s, v12.4s\n"
"fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
"fmla v22.4s, v4.4s, v12.4s\n"
"mov v23.16b, v30.16b\n fmla v23.4s, v3.4s, v12.4s\n"
"fmla v25.4s, v2.4s, v12.4s\n"
@@ -793,13 +793,13 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x38]\n"
"fmla v20.4s, v8.4s, v9.4s\n"
"fmla v21.4s, v7.4s, v9.4s\n"
- "add x20, x20, x15\n"
"fmla v22.4s, v6.4s, v9.4s\n"
"fmla v24.4s, v5.4s, v9.4s\n"
"fmla v25.4s, v4.4s, v9.4s\n"
"fmla v26.4s, v3.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v9.4s\n"
"mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x20, x15\n"
"fmla v30.4s, v0.4s, v9.4s\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
@@ -836,13 +836,13 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x50]\n"
"fmla v21.4s, v8.4s, v10.4s\n"
"fmla v22.4s, v7.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v23.4s, v6.4s, v10.4s\n"
"fmla v25.4s, v5.4s, v10.4s\n"
"fmla v26.4s, v4.4s, v10.4s\n"
"fmla v27.4s, v3.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v10.4s\n"
"fmla v30.4s, v1.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"fmla v31.4s, v0.4s, v10.4s\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
@@ -891,11 +891,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x70]\n"
"fmla v16.4s, v5.4s, v10.4s\n"
"fmla v17.4s, v4.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v18.4s, v3.4s, v10.4s\n"
"fmla v20.4s, v2.4s, v10.4s\n"
"fmla v21.4s, v1.4s, v10.4s\n"
"fmla v22.4s, v0.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 26f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
@@ -919,11 +919,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x80]\n"
"fmla v17.4s, v5.4s, v12.4s\n"
"fmla v18.4s, v4.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v19.4s, v3.4s, v12.4s\n"
"fmla v21.4s, v2.4s, v12.4s\n"
"fmla v22.4s, v1.4s, v12.4s\n"
"fmla v23.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 30f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
@@ -947,11 +947,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x90]\n"
"fmla v16.4s, v7.4s, v10.4s\n"
"fmla v17.4s, v6.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v20.4s, v4.4s, v10.4s\n"
"fmla v21.4s, v3.4s, v10.4s\n"
"fmla v24.4s, v1.4s, v10.4s\n"
"fmla v25.4s, v0.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 34f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
@@ -975,11 +975,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xa0]\n"
"fmla v18.4s, v8.4s, v12.4s\n"
"fmla v19.4s, v7.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v22.4s, v5.4s, v12.4s\n"
"fmla v23.4s, v4.4s, v12.4s\n"
"fmla v26.4s, v2.4s, v12.4s\n"
"fmla v27.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 38f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
@@ -991,8 +991,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xa8]\n"
"fmla v16.4s, v2.4s, v10.4s\n"
"fmla v17.4s, v1.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v18.4s, v0.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 40f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
@@ -1004,11 +1004,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xb0]\n"
"fmla v20.4s, v7.4s, v11.4s\n"
"fmla v21.4s, v6.4s, v11.4s\n"
- "add x20, x20, x15\n"
"fmla v24.4s, v4.4s, v11.4s\n"
"fmla v25.4s, v3.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 42f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 43f\n"
@@ -1020,8 +1020,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xb8]\n"
"fmla v17.4s, v2.4s, v12.4s\n"
"fmla v18.4s, v1.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v19.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 45f\n"
@@ -1033,8 +1033,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xc0]\n"
"fmla v16.4s, v6.4s, v10.4s\n"
"fmla v20.4s, v3.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v24.4s, v0.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 46f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 47f\n"
@@ -1046,11 +1046,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xc8]\n"
"fmla v22.4s, v8.4s, v11.4s\n"
"fmla v23.4s, v7.4s, v11.4s\n"
- "add x20, x20, x15\n"
"fmla v26.4s, v5.4s, v11.4s\n"
"fmla v27.4s, v4.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v11.4s\n"
"fmla v31.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 48f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 49f\n"
@@ -1062,8 +1062,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xd0]\n"
"fmla v19.4s, v8.4s, v12.4s\n"
"fmla v23.4s, v5.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v27.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 50f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 51f\n"
@@ -1075,8 +1075,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xd8]\n"
"fmla v20.4s, v6.4s, v10.4s\n"
"fmla v24.4s, v3.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v28.4s, v0.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 52f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 53f\n"
@@ -1088,11 +1088,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xe0]\n"
"fmla v24.4s, v8.4s, v11.4s\n"
"fmla v25.4s, v7.4s, v11.4s\n"
- "add x20, x20, x15\n"
"fmla v26.4s, v6.4s, v11.4s\n"
"fmla v28.4s, v5.4s, v11.4s\n"
"fmla v29.4s, v4.4s, v11.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 54f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 55f\n"
@@ -1104,8 +1104,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xe8]\n"
"fmla v23.4s, v8.4s, v12.4s\n"
"fmla v27.4s, v5.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 56f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 57f\n"
@@ -1117,8 +1117,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xf0]\n"
"fmla v28.4s, v8.4s, v10.4s\n"
"fmla v29.4s, v7.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v30.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 58f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 59f\n"
@@ -1130,11 +1130,11 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0xf8]\n"
"fmla v25.4s, v8.4s, v11.4s\n"
"fmla v26.4s, v7.4s, v11.4s\n"
- "add x20, x20, x15\n"
"fmla v27.4s, v6.4s, v11.4s\n"
"fmla v29.4s, v5.4s, v11.4s\n"
"fmla v30.4s, v4.4s, v11.4s\n"
"fmla v31.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 60f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 61f\n"
@@ -1146,8 +1146,8 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x100]\n"
"fmla v29.4s, v8.4s, v12.4s\n"
"fmla v30.4s, v7.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v31.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 62f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 63f\n"
@@ -1159,9 +1159,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x108]\n"
"fmla v16.4s, v4.4s, v10.4s\n"
"fmla v17.4s, v3.4s, v10.4s\n"
- "add x20, x20, x15\n"
"fmla v20.4s, v1.4s, v10.4s\n"
"fmla v21.4s, v0.4s, v10.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 64f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 65f\n"
@@ -1173,9 +1173,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x110]\n"
"fmla v18.4s, v5.4s, v11.4s\n"
"fmla v19.4s, v4.4s, v11.4s\n"
- "add x20, x20, x15\n"
"fmla v22.4s, v2.4s, v11.4s\n"
"fmla v23.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 66f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 67f\n"
@@ -1187,9 +1187,9 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x20, [x16, #0x118]\n"
"fmla v24.4s, v7.4s, v12.4s\n"
"fmla v25.4s, v6.4s, v12.4s\n"
- "add x20, x20, x15\n"
"fmla v28.4s, v4.4s, v12.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x15\n"
"tbz %x[n_channels], #1, 68f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 69f\n"
@@ -1200,24 +1200,24 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"69:" // Oddments: Load input (4, 4): Bit 1: End
"fmla v26.4s, v8.4s, v10.4s\n"
"fmla v27.4s, v7.4s, v10.4s\n"
- "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v15.4s\n"
"fmla v30.4s, v5.4s, v10.4s\n"
"fmla v31.4s, v4.4s, v10.4s\n"
- "fmax v17.4s, v17.4s, v13.4s\n"
- "fmax v18.4s, v18.4s, v13.4s\n"
- "fmax v19.4s, v19.4s, v13.4s\n"
- "fmax v20.4s, v20.4s, v13.4s\n"
- "fmax v21.4s, v21.4s, v13.4s\n"
- "fmax v22.4s, v22.4s, v13.4s\n"
- "fmax v23.4s, v23.4s, v13.4s\n"
- "fmax v24.4s, v24.4s, v13.4s\n"
- "fmax v25.4s, v25.4s, v13.4s\n"
- "fmax v26.4s, v26.4s, v13.4s\n"
- "fmax v27.4s, v27.4s, v13.4s\n"
- "fmax v28.4s, v28.4s, v13.4s\n"
- "fmax v29.4s, v29.4s, v13.4s\n"
- "fmax v30.4s, v30.4s, v13.4s\n"
- "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v17.4s, v17.4s, v15.4s\n"
+ "fmax v18.4s, v18.4s, v15.4s\n"
+ "fmax v19.4s, v19.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
"fmin v16.4s, v16.4s, v14.4s\n"
"fmin v17.4s, v17.4s, v14.4s\n"
"fmin v18.4s, v18.4s, v14.4s\n"
@@ -1237,150 +1237,150 @@ void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"tbz %x[n_channels], #1, 70f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.d }[0], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.d }[0], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.d }[0], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.d }[0], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.d }[0], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.d }[0], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.d }[0], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.d }[0], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.d }[0], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.d }[0], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.d }[0], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.d }[0], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
- "add x14, x14, #0x8\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.d }[0], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.d }[0], [x22]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
"st1 { v30.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 71f\n"
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.s }[2], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.s }[2], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.s }[2], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.s }[2], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.s }[2], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.s }[2], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.s }[2], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.s }[2], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.s }[2], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.s }[2], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.s }[2], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.s }[2], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.s }[2], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.s }[2], [x22]\n"
+ "add x20, x20, x14\n"
"st1 { v30.s }[2], [x21]\n"
"st1 { v31.s }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Store: Bit 1: Unset
"ldr x23, [x8, #0x0]\n"
"ldr x22, [x8, #0x8]\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
"ldr x21, [x8, #0x10]\n"
"ldr x20, [x8, #0x18]\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
"st1 { v16.s }[0], [x23]\n"
"ldr x23, [x8, #0x20]\n"
- "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v17.s }[0], [x22]\n"
"ldr x22, [x8, #0x28]\n"
- "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"st1 { v18.s }[0], [x21]\n"
"ldr x21, [x8, #0x30]\n"
- "add x21, x21, x14\n"
"st1 { v19.s }[0], [x20]\n"
"ldr x20, [x8, #0x38]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v20.s }[0], [x23]\n"
"ldr x23, [x8, #0x40]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v21.s }[0], [x22]\n"
"ldr x22, [x8, #0x48]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v22.s }[0], [x21]\n"
"ldr x21, [x8, #0x50]\n"
- "add x21, x21, x14\n"
"st1 { v23.s }[0], [x20]\n"
"ldr x20, [x8, #0x58]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v24.s }[0], [x23]\n"
"ldr x23, [x8, #0x60]\n"
- "add x23, x23, x14\n"
+ "add x21, x21, x14\n"
"st1 { v25.s }[0], [x22]\n"
"ldr x22, [x8, #0x68]\n"
- "add x22, x22, x14\n"
+ "add x20, x20, x14\n"
"st1 { v26.s }[0], [x21]\n"
"ldr x21, [x8, #0x70]\n"
- "add x21, x21, x14\n"
"st1 { v27.s }[0], [x20]\n"
"ldr x20, [x8, #0x78]\n"
- "add x20, x20, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
"st1 { v28.s }[0], [x23]\n"
+ "add x21, x21, x14\n"
"st1 { v29.s }[0], [x22]\n"
+ "add x20, x20, x14\n"
"st1 { v30.s }[0], [x21]\n"
"st1 { v31.s }[0], [x20]\n"
"71:" // Oddments: Store: Bit 1: End
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 5ab61fad4c..ca61372e1c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,259 +87,259 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x23, #0x0\n"
- "mov x27, #0x0\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
"1:" // Tile loop
- "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x26, #0x4\n"
- "mov x25, #0x2\n"
- "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x9, #0x4\n"
+ "mov x28, #0x2\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
"ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x6, x6, #0x2\n"
- "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
- "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x16, x8, x24, LSL #2\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
- "lsr x22, %x[n_channels], #0x2\n"
- "add x14, x16, x24, LSL #2\n"
- "mul x20, x20, x25\n" // offset *= output_tile_size
- "add x13, x6, x6\n"
- "add x12, x14, x24, LSL #2\n"
- "add x11, x13, x6\n"
- "add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x24, %x[n_channels], #0x2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v26.4s }, [x20]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x23, #0x0\n"
"ld1r { v27.4s }, [x20]\n"
- "add x10, x12, x24, LSL #2\n"
- "add x9, x11, x6\n"
- "add x28, x17, x21, LSL #2\n"
+ "mul x22, x11, x27\n" // offset = tile_i * ld_input_row
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x21, XZR, x26\n"
+ "mul x20, x11, x25\n" // offset = tile_i * ld_output_row
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x22, x10, x6, x22\n" // offset += tile_j * ld_input_col
+ "lsl x6, x6, #0x2\n"
+ "madd x20, x10, x7, x20\n" // offset += tile_j * ld_output_col
"lsl x7, x7, #0x2\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q31, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "add x15, x15, #0xa0\n"
- "ldr q9, [x14, x13]\n"
+ "mul x22, x22, x9\n" // offset *= kernel_stride * output_size
+ "add x15, x6, x6\n"
+ "add x14, x15, x6\n"
+ "add x13, x14, x6\n"
+ "mul x20, x20, x28\n" // offset *= output_tile_size
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x12, x8, x27, LSL #2\n"
+ "add x11, x12, x27, LSL #2\n"
+ "add x10, x11, x27, LSL #2\n"
+ "add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x9, x10, x27, LSL #2\n"
+ "add x28, x17, x25, LSL #2\n"
+ "cbz x24, 4f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x26, x24, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x11, x15]\n"
"ld1 { v10.4s }, [x8]\n"
"ldr q11, [x8, x6]\n"
- "ldr q12, [x8, x11]\n"
- "ldr q13, [x8, x9]\n"
- "ld1 { v14.4s }, [x16]\n"
- "ldr q15, [x16, x6]\n"
- "ldr q16, [x8, x13]\n"
+ "ldr q12, [x8, x14]\n"
+ "ldr q13, [x8, x13]\n"
+ "ld1 { v14.4s }, [x12]\n"
+ "ldr q15, [x12, x6]\n"
+ "ldr q16, [x8, x15]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
"mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
- "add x23, x23, #0x10\n"
+ "add x26, x26, #0x10\n"
"add x8, x8, #0x10\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "cmp x26, x24, LSL #4\n"
+ "add x21, x21, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v29.4s, v0.4s, v10.4s\n"
"ld1 { v10.4s }, [x8]\n"
"fmla v28.4s, v1.4s, v12.4s\n"
- "ldr q21, [x16, x9]\n"
+ "ldr q21, [x12, x13]\n"
"fmla v29.4s, v1.4s, v11.4s\n"
- "ldr q18, [x16, x11]\n"
+ "ldr q18, [x12, x14]\n"
"fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q17, [x16, x13]\n"
+ "ldr q20, [x12, x15]\n"
+ "add x12, x12, #0x10\n"
"fmla v29.4s, v3.4s, v14.4s\n"
- "ld1 { v20.4s }, [x12]\n"
+ "ld1 { v17.4s }, [x10]\n"
"fmla v28.4s, v0.4s, v16.4s\n"
- "add x16, x16, #0x10\n"
"fmla v29.4s, v4.4s, v15.4s\n"
- "ld1 { v25.4s }, [x14]\n"
+ "ld1 { v23.4s }, [x11]\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr q19, [x10, x13]\n"
"fmla v28.4s, v4.4s, v18.4s\n"
- "ldr q19, [x12, x6]\n"
+ "ldr q17, [x10, x6]\n"
"fmla v29.4s, v2.4s, v16.4s\n"
- "ldr q18, [x14, x6]\n"
+ "ldr q22, [x11, x6]\n"
"fmla v28.4s, v5.4s, v21.4s\n"
- "ldr q24, [x14, x11]\n"
- "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
- "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
- "ldr q31, [x15, #0x0]\n"
- "cmp x23, x22, LSL #4\n"
- "fmla v29.4s, v5.4s, v17.4s\n"
- "fmla v28.4s, v3.4s, v17.4s\n"
- "ldr q17, [x12, x11]\n"
- "add x20, x20, #0x10\n"
- "fmla v23.4s, v3.4s, v20.4s\n"
- "ldr q16, [x12, x9]\n"
- "fmla v22.4s, v4.4s, v17.4s\n"
- "ldr q21, [x10, x6]\n"
- "fmla v23.4s, v0.4s, v25.4s\n"
- "ldr q0, [x15, #0x10]\n"
- "fmla v22.4s, v1.4s, v24.4s\n"
- "add x21, x21, #0x10\n"
- "fmla v23.4s, v4.4s, v19.4s\n"
- "ldr q20, [x14, x9]\n"
- "ldr q4, [x15, #0x50]\n"
- "fmla v22.4s, v5.4s, v16.4s\n"
- "ldr q19, [x10, x11]\n"
- "fmla v29.4s, v6.4s, v25.4s\n"
- "ld1 { v17.4s }, [x10]\n"
- "fmla v23.4s, v1.4s, v18.4s\n"
- "ldr q1, [x15, #0x20]\n"
- "fmla v22.4s, v2.4s, v20.4s\n"
- "ldr q2, [x15, #0x30]\n"
- "fmla v29.4s, v7.4s, v18.4s\n"
- "ldr q16, [x12, x13]\n"
- "fmla v23.4s, v6.4s, v17.4s\n"
- "ldr q18, [x10, x13]\n"
- "fmla v22.4s, v3.4s, v16.4s\n"
- "ldr q3, [x15, #0x40]\n"
- "fmla v23.4s, v7.4s, v21.4s\n"
- "ldr q13, [x8, x9]\n"
- "fmla v22.4s, v7.4s, v19.4s\n"
- "ld1 { v14.4s }, [x16]\n"
- "fmla v28.4s, v7.4s, v24.4s\n"
- "ldr q12, [x8, x11]\n"
- "fmla v23.4s, v5.4s, v16.4s\n"
- "ldr q16, [x8, x13]\n"
- "ldr q5, [x15, #0x60]\n"
- "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x11, x14]\n"
+ "fmla v25.4s, v0.4s, v23.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v29.4s, v5.4s, v20.4s\n"
+ "fmla v28.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x10, x14]\n"
+ "fmla v24.4s, v4.4s, v16.4s\n"
+ "ldr q21, [x9, x6]\n"
+ "fmla v25.4s, v4.4s, v17.4s\n"
+ "ldr q20, [x11, x13]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, #0x10\n"
+ "ldr q9, [x11, x15]\n"
+ "fmla v29.4s, v6.4s, v23.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v18.4s\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "ldr q12, [x8, x14]\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v24.4s, v5.4s, v19.4s\n"
+ "ldr q19, [x9, x14]\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "ldr q16, [x10, x15]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x9, x15]\n"
"fmla v28.4s, v8.4s, v20.4s\n"
- "ldr q17, [x10, x9]\n"
- "ldr q6, [x15, #0x70]\n"
- "fmla v23.4s, v8.4s, v18.4s\n"
- "fmla v22.4s, v8.4s, v17.4s\n"
- "ldr q11, [x8, x6]\n"
- "ldr q15, [x16, x6]\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "ldr q17, [x9, x13]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x9, x9, #0x10\n"
"fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v25.4s, v7.4s, v21.4s\n"
+ "ldr q13, [x8, x13]\n"
"fmax v28.4s, v28.4s, v26.4s\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
- "fmax v23.4s, v23.4s, v26.4s\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
- "add x14, x14, #0x10\n"
- "ldr q9, [x14, x13]\n"
+ "fmla v24.4s, v3.4s, v16.4s\n"
+ "ldr q3, [x16, #0x40]\n"
"fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x8, x15]\n"
+ "ldr q5, [x16, #0x60]\n"
"fmin v28.4s, v28.4s, v27.4s\n"
- "fmin v23.4s, v23.4s, v27.4s\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "add x12, x12, #0x10\n"
- "add x10, x10, #0x10\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "ld1 { v14.4s }, [x12]\n"
+ "ldr q7, [x16, #0x80]\n"
"st1 { v29.4s }, [x17]\n"
- "add x15, x15, #0xa0\n"
+ "fmla v25.4s, v8.4s, v18.4s\n"
"str q28, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "st1 { v23.4s }, [x28]\n"
- "str q22, [x28, x7]\n"
+ "fmla v24.4s, v6.4s, v18.4s\n"
+ "ldr q15, [x12, x6]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "st1 { v25.4s }, [x28]\n"
+ "str q24, [x28, x7]\n"
"add x28, x28, #0x10\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
- "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
"add x8, x8, #0x10\n"
- "fmla v29.4s, v0.4s, v10.4s\n"
- "fmla v28.4s, v1.4s, v12.4s\n"
- "ldr q20, [x16, x9]\n"
- "fmla v29.4s, v1.4s, v11.4s\n"
- "ldr q18, [x16, x11]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "ldr q17, [x16, x13]\n"
- "fmla v29.4s, v3.4s, v14.4s\n"
- "ld1 { v19.4s }, [x12]\n"
- "fmla v28.4s, v0.4s, v16.4s\n"
- "add x16, x16, #0x10\n"
- "fmla v29.4s, v4.4s, v15.4s\n"
- "ld1 { v25.4s }, [x14]\n"
- "fmla v28.4s, v4.4s, v18.4s\n"
- "ldr q18, [x12, x6]\n"
- "fmla v29.4s, v2.4s, v16.4s\n"
- "ldr q24, [x14, x6]\n"
- "fmla v28.4s, v5.4s, v20.4s\n"
- "ldr q23, [x14, x11]\n"
- "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
- "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
- "fmla v29.4s, v5.4s, v17.4s\n"
- "fmla v28.4s, v3.4s, v17.4s\n"
- "ldr q17, [x12, x11]\n"
- "fmla v22.4s, v3.4s, v19.4s\n"
- "ldr q16, [x12, x9]\n"
- "fmla v21.4s, v4.4s, v17.4s\n"
- "ldr q20, [x10, x6]\n"
- "fmla v22.4s, v0.4s, v25.4s\n"
- "fmla v21.4s, v1.4s, v23.4s\n"
- "fmla v22.4s, v4.4s, v18.4s\n"
- "ldr q19, [x14, x9]\n"
- "fmla v21.4s, v5.4s, v16.4s\n"
- "ldr q18, [x10, x11]\n"
- "fmla v29.4s, v6.4s, v25.4s\n"
- "ld1 { v17.4s }, [x10]\n"
- "fmla v22.4s, v1.4s, v24.4s\n"
- "add x14, x14, #0x10\n"
- "fmla v21.4s, v2.4s, v19.4s\n"
- "fmla v29.4s, v7.4s, v24.4s\n"
- "ldr q16, [x12, x13]\n"
- "fmax v29.4s, v29.4s, v26.4s\n"
- "fmla v22.4s, v6.4s, v17.4s\n"
- "ldr q17, [x10, x13]\n"
- "fmla v21.4s, v3.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v27.4s\n"
- "fmla v22.4s, v7.4s, v20.4s\n"
- "fmla v21.4s, v7.4s, v18.4s\n"
- "st1 { v29.4s }, [x17]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x12, x13]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x12, x14]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x12, x15]\n"
"add x12, x12, #0x10\n"
- "fmla v28.4s, v7.4s, v23.4s\n"
- "fmla v22.4s, v5.4s, v16.4s\n"
- "fmla v21.4s, v6.4s, v17.4s\n"
- "fmla v28.4s, v8.4s, v19.4s\n"
- "ldr q16, [x10, x9]\n"
- "fmax v28.4s, v28.4s, v26.4s\n"
- "fmla v22.4s, v8.4s, v17.4s\n"
- "fmla v21.4s, v8.4s, v16.4s\n"
- "fmax v22.4s, v22.4s, v26.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr q23, [x10, x13]\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "ld1 { v22.4s }, [x11]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x10, x6]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x11, x6]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v21.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "fmla v29.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x10, x14]\n"
+ "fmla v24.4s, v4.4s, v16.4s\n"
+ "ldr q21, [x9, x6]\n"
+ "fmla v25.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x11, x13]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v28.4s, v6.4s, v22.4s\n"
+ "ld1 { v16.4s }, [x9]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "fmla v25.4s, v1.4s, v18.4s\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "ldr q19, [x10, x15]\n"
"add x10, x10, #0x10\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v24.4s, v5.4s, v23.4s\n"
+ "ldr q18, [x9, x14]\n"
+ "fmla v25.4s, v6.4s, v16.4s\n"
+ "ldr q17, [x9, x15]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "ldr q16, [x9, x13]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v25.4s, v7.4s, v21.4s\n"
"fmin v28.4s, v28.4s, v27.4s\n"
- "str q28, [x17, x7]\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v24.4s, v3.4s, v19.4s\n"
+ "st1 { v28.4s }, [x17]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "str q29, [x17, x7]\n"
"add x17, x17, #0x10\n"
- "fmin v22.4s, v22.4s, v27.4s\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "st1 { v22.4s }, [x28]\n"
- "str q21, [x28, x7]\n"
+ "fmla v24.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "fmla v24.4s, v8.4s, v16.4s\n"
+ "st1 { v25.4s }, [x28]\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "str q24, [x28, x7]\n"
"add x28, x28, #0x10\n"
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 43f\n"
- "ldr q31, [x15, #0x0]\n"
- "ldr q0, [x15, #0x10]\n"
- "add x27, x14, x13\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x27, x11, x15\n"
"add x26, x8, XZR\n"
- "ldr q1, [x15, #0x20]\n"
- "ldr q2, [x15, #0x30]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
"add x25, x8, x6\n"
- "add x24, x8, x11\n"
- "ldr q3, [x15, #0x40]\n"
- "ldr q4, [x15, #0x50]\n"
- "add x23, x8, x9\n"
- "add x22, x16, XZR\n"
- "ldr q5, [x15, #0x60]\n"
- "ldr q6, [x15, #0x70]\n"
- "add x21, x16, x6\n"
- "add x20, x8, x13\n"
- "ldr q7, [x15, #0x80]\n"
- "ldr q8, [x15, #0x90]\n"
+ "add x24, x8, x14\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x23, x8, x13\n"
+ "add x22, x12, XZR\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "add x21, x12, x6\n"
+ "add x20, x8, x15\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
"tbz %x[n_channels], #1, 5f\n"
"ldr d9, [x27], #0x8\n"
"ldr d10, [x26], #0x8\n"
@@ -370,18 +370,18 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s16, [x20, #0x0]\n"
"6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
"mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "add x20, x16, x11\n"
"mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
+ "add x20, x12, x14\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
"fmla v28.4s, v4.4s, v15.4s\n"
- "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 7f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 8f\n"
@@ -391,7 +391,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s11, [x20, #0x0]\n"
"8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
"fmla v29.4s, v4.4s, v11.4s\n"
- "add x20, x16, x9\n"
+ "add x20, x12, x13\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 10f\n"
@@ -401,7 +401,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s12, [x20, #0x0]\n"
"10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
"fmla v29.4s, v5.4s, v12.4s\n"
- "add x20, x16, x13\n"
+ "add x20, x12, x15\n"
"tbz %x[n_channels], #1, 11f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
@@ -412,7 +412,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
"fmla v28.4s, v5.4s, v13.4s\n"
"fmla v29.4s, v3.4s, v13.4s\n"
- "add x20, x12, XZR\n"
+ "add x20, x10, XZR\n"
"tbz %x[n_channels], #1, 13f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 14f\n"
@@ -422,7 +422,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s14, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
"fmla v30.4s, v3.4s, v14.4s\n"
- "add x20, x14, XZR\n"
+ "add x20, x11, XZR\n"
"tbz %x[n_channels], #1, 15f\n"
"ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
@@ -433,7 +433,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
"fmla v28.4s, v6.4s, v15.4s\n"
"fmla v30.4s, v0.4s, v15.4s\n"
- "add x20, x12, x6\n"
+ "add x20, x10, x6\n"
"tbz %x[n_channels], #1, 17f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
@@ -443,7 +443,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s11, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
"fmla v30.4s, v4.4s, v11.4s\n"
- "add x20, x14, x6\n"
+ "add x20, x11, x6\n"
"tbz %x[n_channels], #1, 19f\n"
"ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
@@ -454,7 +454,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
"fmla v28.4s, v7.4s, v16.4s\n"
"fmla v30.4s, v1.4s, v16.4s\n"
- "add x20, x12, x11\n"
+ "add x20, x10, x14\n"
"tbz %x[n_channels], #1, 21f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
@@ -464,7 +464,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s13, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
"fmla v31.4s, v4.4s, v13.4s\n"
- "add x20, x14, x11\n"
+ "add x20, x11, x14\n"
"tbz %x[n_channels], #1, 23f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
@@ -475,7 +475,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
"fmla v29.4s, v7.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "add x20, x12, x9\n"
+ "add x20, x10, x13\n"
"tbz %x[n_channels], #1, 25f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
@@ -485,7 +485,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s14, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
"fmla v31.4s, v5.4s, v14.4s\n"
- "add x20, x10, XZR\n"
+ "add x20, x9, XZR\n"
"tbz %x[n_channels], #1, 27f\n"
"ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
@@ -495,7 +495,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s15, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
"fmla v30.4s, v6.4s, v15.4s\n"
- "add x20, x14, x9\n"
+ "add x20, x11, x13\n"
"tbz %x[n_channels], #1, 29f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
@@ -506,7 +506,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
"fmla v29.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "add x20, x10, x6\n"
+ "add x20, x9, x6\n"
"tbz %x[n_channels], #1, 31f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 32f\n"
@@ -516,7 +516,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s13, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
"fmla v30.4s, v7.4s, v13.4s\n"
- "add x20, x12, x13\n"
+ "add x20, x10, x15\n"
"tbz %x[n_channels], #1, 33f\n"
"ldr d16, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
@@ -527,7 +527,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
"fmla v30.4s, v5.4s, v16.4s\n"
"fmla v31.4s, v3.4s, v16.4s\n"
- "add x20, x10, x11\n"
+ "add x20, x9, x14\n"
"tbz %x[n_channels], #1, 35f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
@@ -537,7 +537,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"ldr s14, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
"fmla v31.4s, v7.4s, v14.4s\n"
- "add x20, x10, x13\n"
+ "add x20, x9, x15\n"
"tbz %x[n_channels], #1, 37f\n"
"ldr d15, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
@@ -548,7 +548,7 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
"fmla v30.4s, v8.4s, v15.4s\n"
"fmla v31.4s, v6.4s, v15.4s\n"
- "add x20, x10, x9\n"
+ "add x20, x9, x13\n"
"tbz %x[n_channels], #1, 39f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 40f\n"
@@ -561,18 +561,18 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"fmax v28.4s, v28.4s, v26.4s\n"
"fmax v29.4s, v29.4s, v26.4s\n"
"fmax v30.4s, v30.4s, v26.4s\n"
- "fmax v31.4s, v31.4s, v26.4s\n"
"fmin v28.4s, v28.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
"fmin v29.4s, v29.4s, v27.4s\n"
"fmin v30.4s, v30.4s, v27.4s\n"
"fmin v31.4s, v31.4s, v27.4s\n"
"tbz %x[n_channels], #1, 41f\n"
"mov x21, x17\n"
"mov x20, x28\n"
- "st1 { v28.d }[0], [x21], x7\n"
- "st1 { v30.d }[0], [x20], x7\n"
"add x17, x17, #0x8\n"
"add x28, x28, #0x8\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
"st1 { v29.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 42f\n"
@@ -592,16 +592,16 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"st1 { v31.s }[0], [x20]\n"
"42:" // Tile loop: Oddments: Store: Bit 1: End
"43:" // Tile loop: End
- "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x27, x27, #0x1\n"
- "add x21, x23, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x27, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x23, x23, x21, LT\n"
- "csel x27, x27, XZR, LT\n"
- "cmp x23, x20\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x10, x10, #0x1\n"
+ "add x20, x11, #0x1\n"
+ "cmp x10, x22\n"
+ "csel x11, x11, x20, LT\n"
+ "csel x10, x10, XZR, LT\n"
+ "cmp x11, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 24fe255dfb..3fc1899921 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,275 +87,275 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "lsr x24, %x[n_channels], #0x2\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v26.4s }, [x20]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.4s }, [x21]\n"
"ld1r { v27.4s }, [x20]\n"
- "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x12, x11, [x21, #0x0]\n"
- "ldp x10, x9, [x21, #0x10]\n"
- "mov x28, #0x0\n"
- "sub x22, XZR, x25\n"
- "cbz x24, 3f\n"
- "ldr q31, [x23, #0x0]\n"
- "ldr q0, [x23, #0x10]\n"
- "cmp x25, x24, LSL #4\n"
- "ldr q1, [x23, #0x20]\n"
- "ldr q2, [x23, #0x30]\n"
- "ldr q3, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
- "ldr q5, [x23, #0x60]\n"
- "ldr q6, [x23, #0x70]\n"
- "ldr q7, [x23, #0x80]\n"
- "ldr q8, [x23, #0x90]\n"
- "add x23, x23, #0xa0\n"
- "ldp x21, x20, [x13, #0x0]\n"
- "ldr q9, [x21, x28]\n"
- "ldr q10, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "ldr q11, [x21, x28]\n"
- "ldr q12, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x20]\n"
- "ldr q13, [x21, x28]\n"
- "ldr q14, [x20, x28]\n"
- "ldp x21, x20, [x13, #0x30]\n"
- "ldr q15, [x21, x28]\n"
- "ldr q16, [x20, x28]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x22, #0x0]\n"
+ "ldp x11, x10, [x22, #0x10]\n"
+ "sub x9, XZR, x8\n"
+ "cbz x17, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x8, x17, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q9, [x27, x14]\n"
+ "ldr q10, [x26, x14]\n"
+ "ldr q11, [x25, x14]\n"
+ "ldr q12, [x24, x14]\n"
+ "ldr q13, [x23, x14]\n"
+ "ldr q14, [x22, x14]\n"
+ "ldr q15, [x21, x14]\n"
+ "ldr q16, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
- "mov v24.16b, v31.16b\n fmla v24.4s, v8.4s, v9.4s\n"
- "mov v23.16b, v31.16b\n fmla v23.4s, v6.4s, v9.4s\n"
- "ldr x21, [x13, #0x40]\n"
- "ldr x20, [x13, #0x48]\n"
- "fmla v24.4s, v0.4s, v10.4s\n"
- "fmla v23.4s, v1.4s, v12.4s\n"
- "ldr q20, [x20, x28]\n"
- "ldr x20, [x13, #0x50]\n"
- "fmla v24.4s, v1.4s, v11.4s\n"
- "ldr q19, [x21, x28]\n"
- "fmla v23.4s, v2.4s, v13.4s\n"
- "ldr q18, [x20, x28]\n"
- "fmla v24.4s, v3.4s, v14.4s\n"
- "fmla v23.4s, v0.4s, v16.4s\n"
- "ldr x20, [x13, #0x58]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v24.4s, v4.4s, v15.4s\n"
- "fmla v23.4s, v4.4s, v19.4s\n"
- "ldr x21, [x13, #0x78]\n"
- "ldr x20, [x13, #0x60]\n"
- "ldr q22, [x20, x28]\n"
- "fmla v24.4s, v2.4s, v16.4s\n"
- "fmla v23.4s, v5.4s, v20.4s\n"
- "ldr x20, [x13, #0x80]\n"
- "ldr q21, [x20, x28]\n"
- "mov v20.16b, v31.16b\n fmla v20.4s, v2.4s, v9.4s\n"
- "mov v19.16b, v31.16b\n fmla v19.4s, v0.4s, v9.4s\n"
- "ldr q31, [x23, #0x0]\n"
- "fmla v24.4s, v5.4s, v18.4s\n"
- "fmla v23.4s, v3.4s, v18.4s\n"
- "ldr q16, [x21, x28]\n"
- "ldr x20, [x13, #0x68]\n"
- "ldr q18, [x20, x28]\n"
- "fmla v20.4s, v3.4s, v17.4s\n"
- "fmla v19.4s, v4.4s, v16.4s\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v20.4s, v0.4s, v22.4s\n"
- "ldr q0, [x23, #0x10]\n"
- "fmla v19.4s, v1.4s, v21.4s\n"
- "ldr x20, [x13, #0x70]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v20.4s, v4.4s, v18.4s\n"
- "fmla v19.4s, v5.4s, v16.4s\n"
- "ldr q4, [x23, #0x50]\n"
- "ldr x20, [x13, #0x98]\n"
- "fmla v24.4s, v6.4s, v22.4s\n"
- "fmla v20.4s, v1.4s, v17.4s\n"
- "ldr q16, [x20, x28]\n"
- "ldr q1, [x23, #0x20]\n"
- "fmla v19.4s, v2.4s, v16.4s\n"
- "fmla v24.4s, v7.4s, v17.4s\n"
- "ldr q2, [x23, #0x30]\n"
- "ldr x20, [x13, #0x90]\n"
- "fmla v23.4s, v7.4s, v21.4s\n"
- "fmla v23.4s, v8.4s, v16.4s\n"
- "ldr q16, [x20, x28]\n"
- "ldr x20, [x13, #0xa8]\n"
- "fmla v20.4s, v6.4s, v16.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "ldr x28, [x15, #0x40]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr x27, [x15, #0x78]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x24, [x15, #0x60]\n"
+ "ldr x26, [x15, #0x68]\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x21, x14]\n"
+ "ldr x23, [x15, #0x88]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x28, x14]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x25, x14]\n"
+ "ldr x25, [x15, #0x90]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr q23, [x23, x14]\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ldr q22, [x24, x14]\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "fmla v28.4s, v5.4s, v21.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v25.4s, v4.4s, v17.4s\n"
+ "ldr q21, [x20, x14]\n"
+ "fmla v29.4s, v5.4s, v20.4s\n"
+ "fmla v28.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x27, x14]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v24.4s, v4.4s, v16.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v29.4s, v6.4s, v22.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v25.4s, v1.4s, v19.4s\n"
+ "fmla v24.4s, v1.4s, v18.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v29.4s, v7.4s, v19.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v24.4s, v5.4s, v23.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.4s, v8.4s, v21.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v24.4s, v2.4s, v21.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "fmla v25.4s, v7.4s, v20.4s\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "ldr q9, [x27, x8]\n"
+ "ldr q10, [x26, x8]\n"
+ "fmla v24.4s, v3.4s, v16.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "ldr q12, [x24, x8]\n"
+ "ldr q13, [x23, x8]\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x20, x8]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "str q29, [x13, x9]\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "ldr q14, [x22, x8]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v25.4s, v8.4s, v18.4s\n"
+ "str q28, [x12, x9]\n"
+ "fmla v24.4s, v6.4s, v18.4s\n"
+ "ldr q15, [x21, x8]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "ldr q11, [x25, x8]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x8, x8, #0x10\n"
+ "add x16, x16, #0xa0\n"
+ "cmp x8, x17, LSL #4\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
"fmax v24.4s, v24.4s, v26.4s\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0xa0]\n"
- "fmla v19.4s, v3.4s, v17.4s\n"
- "fmax v23.4s, v23.4s, v26.4s\n"
- "ldr q16, [x20, x28]\n"
- "ldr q3, [x23, #0x40]\n"
- "fmla v20.4s, v7.4s, v16.4s\n"
- "fmla v20.4s, v5.4s, v17.4s\n"
- "ldr q5, [x23, #0x60]\n"
- "ldr x20, [x13, #0xb0]\n"
- "add x22, x22, #0x10\n"
"fmin v24.4s, v24.4s, v27.4s\n"
- "ldr q16, [x20, x28]\n"
- "ldr x20, [x13, #0xb8]\n"
- "fmla v19.4s, v7.4s, v16.4s\n"
- "fmin v23.4s, v23.4s, v27.4s\n"
- "ldr q16, [x20, x28]\n"
- "ldr q7, [x23, #0x80]\n"
- "fmla v19.4s, v6.4s, v16.4s\n"
- "fmla v20.4s, v8.4s, v16.4s\n"
- "ldr q6, [x23, #0x70]\n"
- "ldr x20, [x13, #0xc0]\n"
- "fmax v20.4s, v20.4s, v26.4s\n"
- "fmin v20.4s, v20.4s, v27.4s\n"
- "ldr q16, [x20, x28]\n"
- "fmla v19.4s, v8.4s, v16.4s\n"
- "ldr q8, [x23, #0x90]\n"
- "fmax v19.4s, v19.4s, v26.4s\n"
- "ldp x21, x20, [x13, #0x0]\n"
- "ldr q9, [x21, x25]\n"
- "fmin v19.4s, v19.4s, v27.4s\n"
- "add x28, x28, #0x10\n"
- "ldr q10, [x20, x25]\n"
- "ldp x21, x20, [x13, #0x10]\n"
- "str q24, [x12, x22]\n"
- "add x23, x23, #0xa0\n"
- "ldr q11, [x21, x25]\n"
- "ldr q12, [x20, x25]\n"
- "str q23, [x11, x22]\n"
- "ldp x21, x20, [x13, #0x20]\n"
- "ldr q13, [x21, x25]\n"
- "str q20, [x10, x22]\n"
- "ldr q14, [x20, x25]\n"
- "ldp x21, x20, [x13, #0x30]\n"
- "str q19, [x9, x22]\n"
- "ldr q15, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
- "add x25, x25, #0x10\n"
- "cmp x25, x24, LSL #4\n"
+ "str q25, [x11, x9]\n"
+ "str q24, [x10, x9]\n"
"blt 1b\n"
"2:" // Channel tail
- "mov v25.16b, v31.16b\n fmla v25.4s, v8.4s, v9.4s\n"
- "mov v24.16b, v31.16b\n fmla v24.4s, v6.4s, v9.4s\n"
- "ldr x21, [x13, #0x40]\n"
- "ldr x20, [x13, #0x48]\n"
- "fmla v25.4s, v0.4s, v10.4s\n"
- "fmla v24.4s, v1.4s, v12.4s\n"
- "ldr q20, [x20, x28]\n"
- "ldr x20, [x13, #0x50]\n"
- "fmla v25.4s, v1.4s, v11.4s\n"
- "ldr q18, [x21, x28]\n"
- "fmla v24.4s, v2.4s, v13.4s\n"
- "ldr q19, [x20, x28]\n"
- "fmla v25.4s, v3.4s, v14.4s\n"
- "fmla v24.4s, v0.4s, v16.4s\n"
- "ldr x20, [x13, #0x58]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v25.4s, v4.4s, v15.4s\n"
- "fmla v24.4s, v4.4s, v18.4s\n"
- "ldr x21, [x13, #0x78]\n"
- "ldr x20, [x13, #0x60]\n"
- "ldr q23, [x20, x28]\n"
- "fmla v25.4s, v2.4s, v16.4s\n"
- "fmla v24.4s, v5.4s, v20.4s\n"
- "ldr x20, [x13, #0x80]\n"
- "ldr q22, [x20, x28]\n"
- "mov v21.16b, v31.16b\n fmla v21.4s, v2.4s, v9.4s\n"
- "mov v20.16b, v31.16b\n fmla v20.4s, v0.4s, v9.4s\n"
- "ldr x20, [x13, #0x68]\n"
- "ldr q18, [x20, x28]\n"
- "fmla v25.4s, v5.4s, v19.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "ldr x28, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr x26, [x15, #0x50]\n"
+ "ldr x25, [x15, #0x58]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v2.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v0.4s, v9.4s\n"
+ "ldr x27, [x15, #0x78]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x20, x14]\n"
+ "ldr x21, [x15, #0x88]\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x28, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x26, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "ldr x25, [x15, #0x98]\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "ldr q23, [x24, x14]\n"
+ "ldr x24, [x15, #0xa0]\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr q22, [x21, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "fmla v25.4s, v0.4s, v23.4s\n"
+ "fmla v29.4s, v5.4s, v21.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "fmla v29.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x27, x14]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v24.4s, v4.4s, v16.4s\n"
+ "ldr q21, [x24, x14]\n"
+ "fmla v25.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x25, x14]\n"
+ "fmla v28.4s, v6.4s, v23.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "fmla v25.4s, v1.4s, v18.4s\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v24.4s, v5.4s, v22.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v25.4s, v6.4s, v16.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v25.4s, v7.4s, v21.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
"fmla v24.4s, v3.4s, v19.4s\n"
- "ldr q16, [x21, x28]\n"
- "fmla v21.4s, v3.4s, v17.4s\n"
- "fmla v20.4s, v4.4s, v16.4s\n"
- "ldr x20, [x13, #0x88]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.4s, v0.4s, v23.4s\n"
- "fmla v20.4s, v1.4s, v22.4s\n"
- "ldr x20, [x13, #0x70]\n"
- "ldr q17, [x20, x28]\n"
- "ldr x20, [x13, #0x98]\n"
- "fmla v21.4s, v4.4s, v18.4s\n"
- "ldr q19, [x20, x28]\n"
- "fmla v20.4s, v5.4s, v16.4s\n"
- "fmla v25.4s, v6.4s, v23.4s\n"
- "ldr x20, [x13, #0x90]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.4s, v1.4s, v17.4s\n"
- "ldr x20, [x13, #0xa8]\n"
- "fmla v20.4s, v2.4s, v19.4s\n"
- "fmla v25.4s, v7.4s, v17.4s\n"
- "ldr q18, [x20, x28]\n"
- "ldr x20, [x13, #0xa0]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v21.4s, v6.4s, v16.4s\n"
- "fmla v20.4s, v3.4s, v18.4s\n"
- "ldr x20, [x13, #0xb0]\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.4s, v7.4s, v17.4s\n"
- "fmla v20.4s, v7.4s, v16.4s\n"
- "ldr x20, [x13, #0xb8]\n"
- "ldr q17, [x20, x28]\n"
- "fmla v24.4s, v7.4s, v22.4s\n"
- "fmla v21.4s, v5.4s, v18.4s\n"
- "ldr x20, [x13, #0xc0]\n"
- "fmla v20.4s, v6.4s, v17.4s\n"
- "fmla v24.4s, v8.4s, v19.4s\n"
- "ldr q16, [x20, x28]\n"
- "fmla v21.4s, v8.4s, v17.4s\n"
- "fmla v20.4s, v8.4s, v16.4s\n"
+ "str q28, [x13, x9]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "str q29, [x12, x9]\n"
+ "fmla v24.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
"fmax v25.4s, v25.4s, v26.4s\n"
- "add x22, x22, #0x10\n"
- "fmax v24.4s, v24.4s, v26.4s\n"
- "fmax v21.4s, v21.4s, v26.4s\n"
- "add x28, x28, #0x10\n"
- "fmax v20.4s, v20.4s, v26.4s\n"
"fmin v25.4s, v25.4s, v27.4s\n"
- "str q25, [x12, x22]\n"
+ "fmla v24.4s, v8.4s, v16.4s\n"
+ "str q25, [x11, x9]\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
"fmin v24.4s, v24.4s, v27.4s\n"
- "fmin v21.4s, v21.4s, v27.4s\n"
- "str q24, [x11, x22]\n"
- "fmin v20.4s, v20.4s, v27.4s\n"
- "str q21, [x10, x22]\n"
- "str q20, [x9, x22]\n"
+ "str q24, [x10, x9]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 42f\n"
- "ldr q31, [x23, #0x0]\n"
- "ldr q0, [x23, #0x10]\n"
- "mov x20, x28\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "add x13, x13, x20\n"
"add x12, x12, x20\n"
- "ldr q1, [x23, #0x20]\n"
- "ldr q2, [x23, #0x30]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
"add x11, x11, x20\n"
"add x10, x10, x20\n"
- "ldr q3, [x23, #0x40]\n"
- "ldr q4, [x23, #0x50]\n"
- "add x9, x9, x20\n"
- "ldr q5, [x23, #0x60]\n"
- "ldr q6, [x23, #0x70]\n"
- "ldr q7, [x23, #0x80]\n"
- "ldr q8, [x23, #0x90]\n"
- "ldr x27, [x13, #0x0]\n"
- "ldr x26, [x13, #0x8]\n"
- "add x27, x27, x28\n"
- "add x26, x26, x28\n"
- "ldr x25, [x13, #0x10]\n"
- "ldr x24, [x13, #0x18]\n"
- "add x25, x25, x28\n"
- "add x24, x24, x28\n"
- "ldr x23, [x13, #0x20]\n"
- "ldr x22, [x13, #0x28]\n"
- "add x23, x23, x28\n"
- "add x22, x22, x28\n"
- "ldr x21, [x13, #0x30]\n"
- "ldr x20, [x13, #0x38]\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "ldr x27, [x15, #0x0]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "ldr x25, [x15, #0x10]\n"
+ "ldr x24, [x15, #0x18]\n"
+ "ldr x23, [x15, #0x20]\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "ldr x20, [x15, #0x38]\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v9.d }[0], [x27], #0x8\n"
"ld1 { v10.d }[0], [x26], #0x8\n"
@@ -386,19 +386,19 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1 { v16.s }[0], [x20], #0x4\n"
"5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
"mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
- "fmla v28.4s, v0.4s, v10.4s\n"
- "ldr x20, [x13, #0x40]\n"
- "add x20, x20, x28\n"
"mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
- "fmla v28.4s, v1.4s, v11.4s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
- "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
"fmla v28.4s, v4.4s, v15.4s\n"
- "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
- "fmla v31.4s, v0.4s, v9.4s\n"
"fmla v28.4s, v2.4s, v16.4s\n"
- "fmla v29.4s, v0.4s, v16.4s\n"
"tbz %x[n_channels], #1, 6f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 7f\n"
@@ -407,9 +407,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"6:" // Oddments: Load input (1, 3): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"7:" // Oddments: Load input (1, 3): Bit 1: End
- "ldr x20, [x13, #0x48]\n"
+ "ldr x20, [x15, #0x48]\n"
"fmla v29.4s, v4.4s, v11.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -418,9 +418,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"8:" // Oddments: Load input (1, 4): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"9:" // Oddments: Load input (1, 4): Bit 1: End
- "ldr x20, [x13, #0x50]\n"
+ "ldr x20, [x15, #0x50]\n"
"fmla v29.4s, v5.4s, v12.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 10f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
@@ -429,10 +429,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"10:" // Oddments: Load input (1, 2): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"11:" // Oddments: Load input (1, 2): Bit 1: End
- "ldr x20, [x13, #0x58]\n"
+ "ldr x20, [x15, #0x58]\n"
"fmla v28.4s, v5.4s, v13.4s\n"
"fmla v29.4s, v3.4s, v13.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -441,9 +441,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"12:" // Oddments: Load input (3, 0): Bit 1: Unset
"ld1 { v14.s }[0], [x20], #0x4\n"
"13:" // Oddments: Load input (3, 0): Bit 1: End
- "ldr x20, [x13, #0x60]\n"
+ "ldr x20, [x15, #0x60]\n"
"fmla v30.4s, v3.4s, v14.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -452,10 +452,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"14:" // Oddments: Load input (2, 0): Bit 1: Unset
"ld1 { v15.s }[0], [x20], #0x4\n"
"15:" // Oddments: Load input (2, 0): Bit 1: End
- "ldr x20, [x13, #0x68]\n"
+ "ldr x20, [x15, #0x68]\n"
"fmla v28.4s, v6.4s, v15.4s\n"
"fmla v30.4s, v0.4s, v15.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 16f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
@@ -464,9 +464,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"16:" // Oddments: Load input (3, 1): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"17:" // Oddments: Load input (3, 1): Bit 1: End
- "ldr x20, [x13, #0x70]\n"
+ "ldr x20, [x15, #0x70]\n"
"fmla v30.4s, v4.4s, v11.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
@@ -475,10 +475,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"18:" // Oddments: Load input (2, 1): Bit 1: Unset
"ld1 { v16.s }[0], [x20], #0x4\n"
"19:" // Oddments: Load input (2, 1): Bit 1: End
- "ldr x20, [x13, #0x78]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla v28.4s, v7.4s, v16.4s\n"
"fmla v30.4s, v1.4s, v16.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -487,9 +487,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"20:" // Oddments: Load input (3, 3): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"21:" // Oddments: Load input (3, 3): Bit 1: End
- "ldr x20, [x13, #0x80]\n"
+ "ldr x20, [x15, #0x80]\n"
"fmla v31.4s, v4.4s, v13.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -498,10 +498,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"22:" // Oddments: Load input (2, 3): Bit 1: Unset
"ld1 { v12.s }[0], [x20], #0x4\n"
"23:" // Oddments: Load input (2, 3): Bit 1: End
- "ldr x20, [x13, #0x88]\n"
+ "ldr x20, [x15, #0x88]\n"
"fmla v29.4s, v7.4s, v12.4s\n"
"fmla v31.4s, v1.4s, v12.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -510,9 +510,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"24:" // Oddments: Load input (3, 4): Bit 1: Unset
"ld1 { v14.s }[0], [x20], #0x4\n"
"25:" // Oddments: Load input (3, 4): Bit 1: End
- "ldr x20, [x13, #0x90]\n"
+ "ldr x20, [x15, #0x90]\n"
"fmla v31.4s, v5.4s, v14.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 26f\n"
"ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
@@ -521,9 +521,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"26:" // Oddments: Load input (4, 0): Bit 1: Unset
"ld1 { v15.s }[0], [x20], #0x4\n"
"27:" // Oddments: Load input (4, 0): Bit 1: End
- "ldr x20, [x13, #0x98]\n"
+ "ldr x20, [x15, #0x98]\n"
"fmla v30.4s, v6.4s, v15.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 28f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
@@ -532,10 +532,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"28:" // Oddments: Load input (2, 4): Bit 1: Unset
"ld1 { v11.s }[0], [x20], #0x4\n"
"29:" // Oddments: Load input (2, 4): Bit 1: End
- "ldr x20, [x13, #0xa0]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v29.4s, v8.4s, v11.4s\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 30f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
@@ -544,9 +544,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"30:" // Oddments: Load input (4, 1): Bit 1: Unset
"ld1 { v13.s }[0], [x20], #0x4\n"
"31:" // Oddments: Load input (4, 1): Bit 1: End
- "ldr x20, [x13, #0xa8]\n"
+ "ldr x20, [x15, #0xa8]\n"
"fmla v30.4s, v7.4s, v13.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 32f\n"
"ld1 { v16.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
@@ -555,10 +555,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"32:" // Oddments: Load input (3, 2): Bit 1: Unset
"ld1 { v16.s }[0], [x20], #0x4\n"
"33:" // Oddments: Load input (3, 2): Bit 1: End
- "ldr x20, [x13, #0xb0]\n"
+ "ldr x20, [x15, #0xb0]\n"
"fmla v30.4s, v5.4s, v16.4s\n"
"fmla v31.4s, v3.4s, v16.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 34f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
@@ -567,9 +567,9 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"34:" // Oddments: Load input (4, 3): Bit 1: Unset
"ld1 { v14.s }[0], [x20], #0x4\n"
"35:" // Oddments: Load input (4, 3): Bit 1: End
- "ldr x20, [x13, #0xb8]\n"
+ "ldr x20, [x15, #0xb8]\n"
"fmla v31.4s, v7.4s, v14.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 36f\n"
"ld1 { v15.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
@@ -578,10 +578,10 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"36:" // Oddments: Load input (4, 2): Bit 1: Unset
"ld1 { v15.s }[0], [x20], #0x4\n"
"37:" // Oddments: Load input (4, 2): Bit 1: End
- "ldr x20, [x13, #0xc0]\n"
+ "ldr x20, [x15, #0xc0]\n"
"fmla v30.4s, v8.4s, v15.4s\n"
"fmla v31.4s, v6.4s, v15.4s\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 38f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
@@ -594,32 +594,32 @@ void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"fmax v28.4s, v28.4s, v26.4s\n"
"fmax v29.4s, v29.4s, v26.4s\n"
"fmax v30.4s, v30.4s, v26.4s\n"
- "fmax v31.4s, v31.4s, v26.4s\n"
"fmin v28.4s, v28.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
"fmin v29.4s, v29.4s, v27.4s\n"
"fmin v30.4s, v30.4s, v27.4s\n"
"fmin v31.4s, v31.4s, v27.4s\n"
"tbz %x[n_channels], #1, 40f\n"
- "st1 { v28.d }[0], [x12], #0x8\n"
- "st1 { v29.d }[0], [x11], #0x8\n"
- "st1 { v30.d }[0], [x10], #0x8\n"
- "st1 { v31.d }[0], [x9], #0x8\n"
+ "st1 { v28.d }[0], [x13], #0x8\n"
+ "st1 { v29.d }[0], [x12], #0x8\n"
+ "st1 { v30.d }[0], [x11], #0x8\n"
+ "st1 { v31.d }[0], [x10], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
- "st1 { v28.s }[2], [x12], #0x4\n"
- "st1 { v29.s }[2], [x11], #0x4\n"
- "st1 { v30.s }[2], [x10], #0x4\n"
- "st1 { v31.s }[2], [x9], #0x4\n"
+ "st1 { v28.s }[2], [x13], #0x4\n"
+ "st1 { v29.s }[2], [x12], #0x4\n"
+ "st1 { v30.s }[2], [x11], #0x4\n"
+ "st1 { v31.s }[2], [x10], #0x4\n"
"b 41f\n"
"40:" // Oddments: Store: Bit 1: Unset
- "st1 { v28.s }[0], [x12], #0x4\n"
- "st1 { v29.s }[0], [x11], #0x4\n"
- "st1 { v30.s }[0], [x10], #0x4\n"
- "st1 { v31.s }[0], [x9], #0x4\n"
+ "st1 { v28.s }[0], [x13], #0x4\n"
+ "st1 { v29.s }[0], [x12], #0x4\n"
+ "st1 { v30.s }[0], [x11], #0x4\n"
+ "st1 { v31.s }[0], [x10], #0x4\n"
"41:" // Oddments: Store: Bit 1: End
"42:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 3426fbc3f9..e35f4fdf4e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,251 +87,251 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
);
__asm__ __volatile__(
- "mov x27, #0x0\n"
- "mov x26, #0x0\n"
+ "mov x11, #0x0\n"
+ "mov x10, #0x0\n"
"1:" // Tile loop
- "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x23, #0x2\n"
- "mov x25, #0x2\n"
- "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x9, #0x2\n"
+ "mov x28, #0x2\n"
+ "str x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
"ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "lsl x2, x2, #0x2\n"
- "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
- "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x6, x2, x2\n"
- "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
- "add x4, x4, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x7, x4, x24, LSL #2\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
- "add x17, x7, x24, LSL #2\n"
- "mov x23, #0x10\n" // cntb _, ALL, #1
- "mul x20, x20, x25\n" // offset *= output_tile_size
- "lsr x22, %x[n_channels], #0x2\n"
- "add x16, x17, x24, LSL #2\n"
- "add x15, x6, x2\n"
- "add x14, x16, x24, LSL #2\n"
- "add x13, x15, x2\n"
- "add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x24, %x[n_channels], #0x2\n"
"add x20, %x[params_struct], %[offsetof_args_min]\n"
"ld1r { v27.4s }, [x20]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "mov x23, #0x0\n"
"ld1r { v15.4s }, [x20]\n"
- "add x12, x14, x24, LSL #2\n"
- "add x11, x13, x2\n"
- "add x10, x5, x21, LSL #2\n"
+ "mul x22, x11, x27\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "sub x21, XZR, x26\n"
+ "mul x20, x11, x25\n" // offset = tile_i * ld_output_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x22, x10, x2, x22\n" // offset += tile_j * ld_input_col
+ "lsl x2, x2, #0x2\n"
+ "madd x20, x10, x3, x20\n" // offset += tile_j * ld_output_col
"lsl x3, x3, #0x2\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "cbz x22, 4f\n"
- "ldr q25, [x8, #0x0]\n"
- "ldr q0, [x8, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
- "ldr q1, [x8, #0x20]\n"
- "ldr q2, [x8, #0x30]\n"
- "ldr q3, [x8, #0x40]\n"
- "ldr q4, [x8, #0x50]\n"
- "add x8, x8, #0x60\n"
+ "mul x22, x22, x9\n" // offset *= kernel_stride * output_size
+ "add x7, x2, x2\n"
+ "add x8, x7, x2\n"
+ "add x17, x8, x2\n"
+ "mul x20, x20, x28\n" // offset *= output_tile_size
+ "add x16, x17, x2\n"
+ "add x4, x4, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x15, x4, x27, LSL #2\n"
+ "add x14, x15, x27, LSL #2\n"
+ "add x13, x14, x27, LSL #2\n"
+ "add x12, x13, x27, LSL #2\n"
+ "add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x11, x12, x27, LSL #2\n"
+ "add x10, x5, x25, LSL #2\n"
+ "cbz x24, 4f\n"
+ "ldr q25, [x6, #0x0]\n"
+ "ldr q0, [x6, #0x10]\n"
+ "cmp x26, x24, LSL #4\n"
+ "ldr q1, [x6, #0x20]\n"
+ "ldr q2, [x6, #0x30]\n"
+ "ldr q3, [x6, #0x40]\n"
+ "ldr q4, [x6, #0x50]\n"
+ "add x6, x6, #0x60\n"
"ld1 { v5.4s }, [x4]\n"
"ldr q6, [x4, x2]\n"
- "ld1 { v7.4s }, [x7]\n"
- "ldr q8, [x7, x2]\n"
- "ldr q9, [x4, x6]\n"
- "ldr q13, [x7, x6]\n"
- "ldr q11, [x4, x15]\n"
- "ldr q12, [x4, x13]\n"
- "ldr q10, [x7, x11]\n"
- "ld1 { v14.4s }, [x17]\n"
+ "ld1 { v7.4s }, [x15]\n"
+ "ldr q8, [x15, x2]\n"
+ "ldr q9, [x4, x7]\n"
+ "ldr q13, [x15, x7]\n"
+ "ldr q11, [x4, x8]\n"
+ "ldr q12, [x4, x17]\n"
+ "ldr q10, [x15, x16]\n"
+ "ld1 { v14.4s }, [x14]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
"mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v5.4s\n"
- "ldr q23, [x7, x15]\n"
+ "ldr q23, [x15, x8]\n"
"mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v6.4s\n"
- "add x23, x23, #0x10\n"
+ "add x26, x26, #0x10\n"
"mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v7.4s\n"
"mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v8.4s\n"
- "ldr q19, [x8, #0x0]\n"
- "ldr q25, [x8, #0x140]\n"
+ "ldr q19, [x6, #0x0]\n"
+ "ldr q25, [x6, #0x140]\n"
+ "cmp x26, x24, LSL #4\n"
+ "add x21, x21, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla v30.4s, v1.4s, v6.4s\n"
- "ldr q21, [x7, x13]\n"
+ "ldr q21, [x15, x17]\n"
+ "add x15, x15, #0x10\n"
"fmla v31.4s, v1.4s, v9.4s\n"
- "add x7, x7, #0x10\n"
"fmla v29.4s, v1.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v13.4s\n"
- "ldr q1, [x8, #0x10]\n"
- "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x6, #0x10]\n"
"fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q18, [x4, x11]\n"
- "fmla v31.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x4, x16]\n"
"add x4, x4, #0x10\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
"fmla v28.4s, v2.4s, v23.4s\n"
- "ldr q17, [x8, #0x20]\n"
- "add x20, x20, #0x10\n"
+ "ldr q17, [x6, #0x20]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "ldr q6, [x17, x2]\n"
+ "ldr q6, [x14, x2]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "add x21, x21, #0x10\n"
"fmla v29.4s, v3.4s, v23.4s\n"
"fmla v28.4s, v3.4s, v21.4s\n"
- "ldr q16, [x8, #0x30]\n"
+ "ldr q16, [x6, #0x30]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "ldr q2, [x17, x6]\n"
+ "ldr q2, [x14, x7]\n"
"fmla v31.4s, v4.4s, v18.4s\n"
- "ldr q0, [x17, x15]\n"
+ "ldr q0, [x14, x8]\n"
"fmla v29.4s, v4.4s, v21.4s\n"
"fmla v28.4s, v4.4s, v10.4s\n"
- "ldr q20, [x8, #0x40]\n"
+ "ldr q20, [x6, #0x40]\n"
"fmla v30.4s, v19.4s, v7.4s\n"
- "ld1 { v7.4s }, [x7]\n"
+ "ld1 { v7.4s }, [x15]\n"
"fmla v31.4s, v19.4s, v8.4s\n"
"fmla v29.4s, v19.4s, v14.4s\n"
"fmla v28.4s, v19.4s, v6.4s\n"
- "ldr q19, [x8, #0x50]\n"
+ "ldr q19, [x6, #0x50]\n"
"fmla v30.4s, v1.4s, v8.4s\n"
- "ldr q26, [x17, x11]\n"
+ "ldr q26, [x14, x16]\n"
"fmla v31.4s, v1.4s, v13.4s\n"
"fmla v29.4s, v1.4s, v6.4s\n"
"fmla v28.4s, v1.4s, v2.4s\n"
- "ldr q18, [x8, #0x60]\n"
+ "ldr q18, [x6, #0x60]\n"
"fmla v30.4s, v17.4s, v13.4s\n"
- "ldr q1, [x17, x13]\n"
+ "ldr q1, [x14, x17]\n"
+ "add x14, x14, #0x10\n"
"fmla v31.4s, v17.4s, v23.4s\n"
- "add x17, x17, #0x10\n"
"fmla v29.4s, v17.4s, v2.4s\n"
"fmla v28.4s, v17.4s, v0.4s\n"
- "ldr q17, [x8, #0x70]\n"
+ "ldr q17, [x6, #0x70]\n"
"fmla v30.4s, v16.4s, v23.4s\n"
- "ld1 { v24.4s }, [x16]\n"
+ "ld1 { v24.4s }, [x13]\n"
"fmla v31.4s, v16.4s, v21.4s\n"
"fmla v29.4s, v16.4s, v0.4s\n"
"fmla v28.4s, v16.4s, v1.4s\n"
- "ldr q16, [x8, #0x80]\n"
+ "ldr q16, [x6, #0x80]\n"
"fmla v30.4s, v20.4s, v21.4s\n"
- "ldr q23, [x16, x2]\n"
+ "ldr q23, [x13, x2]\n"
"fmla v31.4s, v20.4s, v10.4s\n"
- "ldr q22, [x16, x6]\n"
+ "ldr q22, [x13, x7]\n"
"fmla v29.4s, v20.4s, v1.4s\n"
"fmla v28.4s, v20.4s, v26.4s\n"
- "ldr q21, [x8, #0x90]\n"
+ "ldr q21, [x6, #0x90]\n"
"fmla v30.4s, v19.4s, v14.4s\n"
- "ldr q5, [x16, x11]\n"
+ "ldr q5, [x13, x16]\n"
"fmla v31.4s, v19.4s, v6.4s\n"
"fmla v29.4s, v19.4s, v24.4s\n"
"fmla v28.4s, v19.4s, v23.4s\n"
- "ldr q11, [x8, #0xa0]\n"
+ "ldr q11, [x6, #0xa0]\n"
"fmla v30.4s, v18.4s, v6.4s\n"
- "ldr q20, [x16, x15]\n"
+ "ldr q20, [x13, x8]\n"
"fmla v31.4s, v18.4s, v2.4s\n"
"fmla v29.4s, v18.4s, v23.4s\n"
"fmla v28.4s, v18.4s, v22.4s\n"
- "ldr q18, [x8, #0xb0]\n"
+ "ldr q18, [x6, #0xb0]\n"
"fmla v30.4s, v17.4s, v2.4s\n"
- "ldr q19, [x16, x13]\n"
+ "ldr q19, [x13, x17]\n"
+ "add x13, x13, #0x10\n"
"fmla v31.4s, v17.4s, v0.4s\n"
- "add x16, x16, #0x10\n"
"fmla v29.4s, v17.4s, v22.4s\n"
"fmla v28.4s, v17.4s, v20.4s\n"
- "ldr q17, [x8, #0xc0]\n"
+ "ldr q17, [x6, #0xc0]\n"
"fmla v30.4s, v16.4s, v0.4s\n"
- "ld1 { v0.4s }, [x14]\n"
+ "ld1 { v0.4s }, [x12]\n"
"fmla v31.4s, v16.4s, v1.4s\n"
"fmla v29.4s, v16.4s, v20.4s\n"
"fmla v28.4s, v16.4s, v19.4s\n"
- "ldr q16, [x8, #0xd0]\n"
+ "ldr q16, [x6, #0xd0]\n"
"fmla v30.4s, v21.4s, v1.4s\n"
- "ldr q4, [x14, x2]\n"
+ "ldr q4, [x12, x2]\n"
"fmla v31.4s, v21.4s, v26.4s\n"
- "ldr q12, [x14, x13]\n"
+ "ldr q12, [x12, x17]\n"
"fmla v29.4s, v21.4s, v19.4s\n"
"fmla v28.4s, v21.4s, v5.4s\n"
- "ldr q13, [x8, #0xe0]\n"
+ "ldr q13, [x6, #0xe0]\n"
"fmla v30.4s, v11.4s, v24.4s\n"
- "ldr q6, [x14, x6]\n"
+ "ldr q6, [x12, x7]\n"
"fmla v31.4s, v11.4s, v23.4s\n"
"fmla v29.4s, v11.4s, v0.4s\n"
"fmla v28.4s, v11.4s, v4.4s\n"
- "ldr q24, [x8, #0xf0]\n"
+ "ldr q24, [x6, #0xf0]\n"
"fmla v30.4s, v18.4s, v23.4s\n"
- "ldr q26, [x14, x15]\n"
+ "ldr q26, [x12, x8]\n"
"fmla v31.4s, v18.4s, v22.4s\n"
"fmla v29.4s, v18.4s, v4.4s\n"
"fmla v28.4s, v18.4s, v6.4s\n"
- "ldr q23, [x8, #0x100]\n"
+ "ldr q23, [x6, #0x100]\n"
"fmla v30.4s, v17.4s, v22.4s\n"
- "ldr q22, [x14, x11]\n"
+ "ldr q22, [x12, x16]\n"
+ "add x12, x12, #0x10\n"
"fmla v31.4s, v17.4s, v20.4s\n"
- "add x14, x14, #0x10\n"
"fmla v29.4s, v17.4s, v6.4s\n"
"fmla v28.4s, v17.4s, v26.4s\n"
- "ldr q21, [x8, #0x110]\n"
+ "ldr q21, [x6, #0x110]\n"
"fmla v30.4s, v16.4s, v20.4s\n"
- "ld1 { v18.4s }, [x12]\n"
+ "ld1 { v18.4s }, [x11]\n"
"fmla v31.4s, v16.4s, v19.4s\n"
"fmla v29.4s, v16.4s, v26.4s\n"
"fmla v28.4s, v16.4s, v12.4s\n"
- "ldr q20, [x8, #0x120]\n"
+ "ldr q20, [x6, #0x120]\n"
"fmla v30.4s, v13.4s, v19.4s\n"
- "ldr q17, [x12, x2]\n"
+ "ldr q17, [x11, x2]\n"
"fmla v31.4s, v13.4s, v5.4s\n"
- "ld1 { v14.4s }, [x17]\n"
+ "ld1 { v14.4s }, [x14]\n"
"fmla v29.4s, v13.4s, v12.4s\n"
"fmla v28.4s, v13.4s, v22.4s\n"
- "ldr q19, [x8, #0x130]\n"
+ "ldr q19, [x6, #0x130]\n"
"fmla v30.4s, v24.4s, v0.4s\n"
- "ldr q16, [x12, x6]\n"
+ "ldr q16, [x11, x7]\n"
"fmla v31.4s, v24.4s, v4.4s\n"
"fmla v29.4s, v24.4s, v18.4s\n"
- "ldr q18, [x12, x15]\n"
+ "ldr q18, [x11, x8]\n"
"fmla v28.4s, v24.4s, v17.4s\n"
- "ldr q0, [x8, #0x150]\n"
+ "ldr q0, [x6, #0x150]\n"
"fmla v30.4s, v23.4s, v4.4s\n"
- "ldr q13, [x7, x6]\n"
+ "ldr q13, [x15, x7]\n"
"fmla v31.4s, v23.4s, v6.4s\n"
"fmla v29.4s, v23.4s, v17.4s\n"
- "ldr q17, [x12, x13]\n"
+ "ldr q17, [x11, x17]\n"
"fmla v28.4s, v23.4s, v16.4s\n"
- "ldr q1, [x8, #0x160]\n"
+ "ldr q1, [x6, #0x160]\n"
"fmla v30.4s, v21.4s, v6.4s\n"
"ld1 { v5.4s }, [x4]\n"
"fmla v31.4s, v21.4s, v26.4s\n"
"fmla v29.4s, v21.4s, v16.4s\n"
- "ldr q16, [x12, x11]\n"
+ "ldr q16, [x11, x16]\n"
+ "add x11, x11, #0x10\n"
"fmla v28.4s, v21.4s, v18.4s\n"
- "ldr q2, [x8, #0x170]\n"
+ "ldr q2, [x6, #0x170]\n"
"fmla v30.4s, v20.4s, v26.4s\n"
"ldr q6, [x4, x2]\n"
"fmla v31.4s, v20.4s, v12.4s\n"
- "add x12, x12, #0x10\n"
"fmla v29.4s, v20.4s, v18.4s\n"
- "ldr q11, [x4, x15]\n"
+ "ldr q11, [x4, x8]\n"
"fmla v28.4s, v20.4s, v17.4s\n"
- "ldr q3, [x8, #0x180]\n"
+ "ldr q3, [x6, #0x180]\n"
"fmla v30.4s, v19.4s, v12.4s\n"
- "ldr q8, [x7, x2]\n"
+ "ldr q8, [x15, x2]\n"
"fmla v31.4s, v19.4s, v22.4s\n"
- "ldr q10, [x7, x11]\n"
+ "ldr q10, [x15, x16]\n"
"fmla v29.4s, v19.4s, v17.4s\n"
- "ldr q12, [x4, x13]\n"
+ "ldr q12, [x4, x17]\n"
"fmla v28.4s, v19.4s, v16.4s\n"
- "ldr q9, [x4, x6]\n"
- "ldr q4, [x8, #0x190]\n"
+ "ldr q9, [x4, x7]\n"
+ "ldr q4, [x6, #0x190]\n"
+ "add x6, x6, #0x1a0\n"
"fmax v30.4s, v30.4s, v27.4s\n"
"fmax v31.4s, v31.4s, v27.4s\n"
- "add x8, x8, #0x1a0\n"
"fmax v29.4s, v29.4s, v27.4s\n"
"fmax v28.4s, v28.4s, v27.4s\n"
"fmin v30.4s, v30.4s, v15.4s\n"
"fmin v31.4s, v31.4s, v15.4s\n"
- "st1 { v30.4s }, [x5]\n"
"fmin v29.4s, v29.4s, v15.4s\n"
"fmin v28.4s, v28.4s, v15.4s\n"
+ "st1 { v30.4s }, [x5]\n"
"str q31, [x5, x3]\n"
"add x5, x5, #0x10\n"
"st1 { v29.4s }, [x10]\n"
@@ -340,163 +340,163 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"blt 2b\n"
"3:" // Tile loop: Channel tail
"mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v5.4s\n"
- "ldr q22, [x7, x15]\n"
+ "ldr q22, [x15, x8]\n"
"mov v5.16b, v25.16b\n fmla v5.4s, v0.4s, v6.4s\n"
"mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v8.4s\n"
- "ldr q19, [x8, #0x0]\n"
+ "ldr q19, [x6, #0x0]\n"
"fmla v31.4s, v1.4s, v6.4s\n"
- "ldr q21, [x7, x13]\n"
+ "ldr q21, [x15, x17]\n"
+ "add x15, x15, #0x10\n"
"fmla v5.4s, v1.4s, v9.4s\n"
- "add x7, x7, #0x10\n"
"fmla v30.4s, v1.4s, v8.4s\n"
"fmla v29.4s, v1.4s, v13.4s\n"
- "ldr q18, [x8, #0x10]\n"
+ "ldr q18, [x6, #0x10]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q16, [x4, x11]\n"
- "fmla v5.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x4, x16]\n"
"add x4, x4, #0x10\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"fmla v29.4s, v2.4s, v22.4s\n"
- "ldr q17, [x8, #0x20]\n"
+ "ldr q17, [x6, #0x20]\n"
"fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q6, [x17, x2]\n"
+ "ldr q6, [x14, x2]\n"
"fmla v5.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v22.4s\n"
"fmla v29.4s, v3.4s, v21.4s\n"
- "ldr q20, [x8, #0x30]\n"
+ "ldr q20, [x6, #0x30]\n"
"fmla v31.4s, v4.4s, v12.4s\n"
- "ldr q2, [x17, x6]\n"
+ "ldr q2, [x14, x7]\n"
"fmla v5.4s, v4.4s, v16.4s\n"
- "ldr q28, [x17, x15]\n"
+ "ldr q28, [x14, x8]\n"
"fmla v30.4s, v4.4s, v21.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q16, [x8, #0x40]\n"
+ "ldr q16, [x6, #0x40]\n"
"fmla v31.4s, v19.4s, v7.4s\n"
"fmla v5.4s, v19.4s, v8.4s\n"
"fmla v30.4s, v19.4s, v14.4s\n"
"fmla v29.4s, v19.4s, v6.4s\n"
- "ldr q19, [x8, #0x50]\n"
+ "ldr q19, [x6, #0x50]\n"
"fmla v31.4s, v18.4s, v8.4s\n"
- "ldr q1, [x17, x11]\n"
+ "ldr q1, [x14, x16]\n"
"fmla v5.4s, v18.4s, v13.4s\n"
"fmla v30.4s, v18.4s, v6.4s\n"
"fmla v29.4s, v18.4s, v2.4s\n"
- "ldr q18, [x8, #0x60]\n"
+ "ldr q18, [x6, #0x60]\n"
"fmla v31.4s, v17.4s, v13.4s\n"
- "ldr q26, [x17, x13]\n"
+ "ldr q26, [x14, x17]\n"
+ "add x14, x14, #0x10\n"
"fmla v5.4s, v17.4s, v22.4s\n"
- "add x17, x17, #0x10\n"
"fmla v30.4s, v17.4s, v2.4s\n"
"fmla v29.4s, v17.4s, v28.4s\n"
- "ldr q17, [x8, #0x70]\n"
+ "ldr q17, [x6, #0x70]\n"
"fmla v31.4s, v20.4s, v22.4s\n"
- "ld1 { v25.4s }, [x16]\n"
+ "ld1 { v25.4s }, [x13]\n"
"fmla v5.4s, v20.4s, v21.4s\n"
"fmla v30.4s, v20.4s, v28.4s\n"
"fmla v29.4s, v20.4s, v26.4s\n"
- "ldr q24, [x8, #0x80]\n"
+ "ldr q24, [x6, #0x80]\n"
"fmla v31.4s, v16.4s, v21.4s\n"
- "ldr q23, [x16, x2]\n"
+ "ldr q23, [x13, x2]\n"
"fmla v5.4s, v16.4s, v10.4s\n"
- "ldr q0, [x16, x6]\n"
+ "ldr q0, [x13, x7]\n"
"fmla v30.4s, v16.4s, v26.4s\n"
"fmla v29.4s, v16.4s, v1.4s\n"
- "ldr q22, [x8, #0x90]\n"
+ "ldr q22, [x6, #0x90]\n"
"fmla v31.4s, v19.4s, v14.4s\n"
- "ldr q16, [x16, x11]\n"
+ "ldr q16, [x13, x16]\n"
"fmla v5.4s, v19.4s, v6.4s\n"
"fmla v30.4s, v19.4s, v25.4s\n"
"fmla v29.4s, v19.4s, v23.4s\n"
- "ldr q21, [x8, #0xa0]\n"
+ "ldr q21, [x6, #0xa0]\n"
"fmla v31.4s, v18.4s, v6.4s\n"
- "ldr q20, [x16, x15]\n"
+ "ldr q20, [x13, x8]\n"
"fmla v5.4s, v18.4s, v2.4s\n"
"fmla v30.4s, v18.4s, v23.4s\n"
"fmla v29.4s, v18.4s, v0.4s\n"
- "ldr q18, [x8, #0xb0]\n"
+ "ldr q18, [x6, #0xb0]\n"
"fmla v31.4s, v17.4s, v2.4s\n"
- "ldr q19, [x16, x13]\n"
+ "ldr q19, [x13, x17]\n"
+ "add x13, x13, #0x10\n"
"fmla v5.4s, v17.4s, v28.4s\n"
- "add x16, x16, #0x10\n"
"fmla v30.4s, v17.4s, v0.4s\n"
"fmla v29.4s, v17.4s, v20.4s\n"
- "ldr q17, [x8, #0xc0]\n"
+ "ldr q17, [x6, #0xc0]\n"
"fmla v31.4s, v24.4s, v28.4s\n"
- "ld1 { v7.4s }, [x14]\n"
+ "ld1 { v7.4s }, [x12]\n"
"fmla v5.4s, v24.4s, v26.4s\n"
"fmla v30.4s, v24.4s, v20.4s\n"
"fmla v29.4s, v24.4s, v19.4s\n"
- "ldr q2, [x8, #0xd0]\n"
+ "ldr q2, [x6, #0xd0]\n"
"fmla v31.4s, v22.4s, v26.4s\n"
- "ldr q28, [x14, x2]\n"
+ "ldr q28, [x12, x2]\n"
"fmla v5.4s, v22.4s, v1.4s\n"
- "ldr q13, [x14, x13]\n"
+ "ldr q13, [x12, x17]\n"
"fmla v30.4s, v22.4s, v19.4s\n"
"fmla v29.4s, v22.4s, v16.4s\n"
- "ldr q14, [x8, #0xe0]\n"
+ "ldr q14, [x6, #0xe0]\n"
"fmla v31.4s, v21.4s, v25.4s\n"
- "ldr q26, [x14, x6]\n"
+ "ldr q26, [x12, x7]\n"
"fmla v5.4s, v21.4s, v23.4s\n"
"fmla v30.4s, v21.4s, v7.4s\n"
"fmla v29.4s, v21.4s, v28.4s\n"
- "ldr q25, [x8, #0xf0]\n"
+ "ldr q25, [x6, #0xf0]\n"
"fmla v31.4s, v18.4s, v23.4s\n"
- "ldr q24, [x14, x15]\n"
+ "ldr q24, [x12, x8]\n"
"fmla v5.4s, v18.4s, v0.4s\n"
"fmla v30.4s, v18.4s, v28.4s\n"
"fmla v29.4s, v18.4s, v26.4s\n"
- "ldr q23, [x8, #0x100]\n"
+ "ldr q23, [x6, #0x100]\n"
"fmla v31.4s, v17.4s, v0.4s\n"
- "ldr q22, [x14, x11]\n"
+ "ldr q22, [x12, x16]\n"
+ "add x12, x12, #0x10\n"
"fmla v5.4s, v17.4s, v20.4s\n"
- "add x14, x14, #0x10\n"
"fmla v30.4s, v17.4s, v26.4s\n"
"fmla v29.4s, v17.4s, v24.4s\n"
- "ldr q21, [x8, #0x110]\n"
+ "ldr q21, [x6, #0x110]\n"
"fmla v31.4s, v2.4s, v20.4s\n"
- "ld1 { v18.4s }, [x12]\n"
+ "ld1 { v18.4s }, [x11]\n"
"fmla v5.4s, v2.4s, v19.4s\n"
"fmla v30.4s, v2.4s, v24.4s\n"
"fmla v29.4s, v2.4s, v13.4s\n"
- "ldr q20, [x8, #0x120]\n"
+ "ldr q20, [x6, #0x120]\n"
"fmla v31.4s, v14.4s, v19.4s\n"
- "ldr q17, [x12, x2]\n"
+ "ldr q17, [x11, x2]\n"
"fmla v5.4s, v14.4s, v16.4s\n"
"fmla v30.4s, v14.4s, v13.4s\n"
"fmla v29.4s, v14.4s, v22.4s\n"
- "ldr q19, [x8, #0x130]\n"
- "add x8, x8, #0x140\n"
+ "ldr q19, [x6, #0x130]\n"
+ "add x6, x6, #0x140\n"
"fmla v31.4s, v25.4s, v7.4s\n"
- "ldr q16, [x12, x6]\n"
+ "ldr q16, [x11, x7]\n"
"fmla v5.4s, v25.4s, v28.4s\n"
"fmla v30.4s, v25.4s, v18.4s\n"
- "ldr q18, [x12, x15]\n"
+ "ldr q18, [x11, x8]\n"
"fmla v29.4s, v25.4s, v17.4s\n"
"fmla v31.4s, v23.4s, v28.4s\n"
"fmla v5.4s, v23.4s, v26.4s\n"
"fmla v30.4s, v23.4s, v17.4s\n"
- "ldr q17, [x12, x13]\n"
+ "ldr q17, [x11, x17]\n"
"fmla v29.4s, v23.4s, v16.4s\n"
"fmla v31.4s, v21.4s, v26.4s\n"
"fmla v5.4s, v21.4s, v24.4s\n"
"fmla v30.4s, v21.4s, v16.4s\n"
- "ldr q16, [x12, x11]\n"
+ "ldr q16, [x11, x16]\n"
+ "add x11, x11, #0x10\n"
"fmla v29.4s, v21.4s, v18.4s\n"
- "add x12, x12, #0x10\n"
"fmla v31.4s, v20.4s, v24.4s\n"
"fmla v5.4s, v20.4s, v13.4s\n"
"fmla v30.4s, v20.4s, v18.4s\n"
"fmla v29.4s, v20.4s, v17.4s\n"
"fmla v31.4s, v19.4s, v13.4s\n"
"fmla v5.4s, v19.4s, v22.4s\n"
- "fmax v31.4s, v31.4s, v27.4s\n"
"fmla v30.4s, v19.4s, v17.4s\n"
"fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"fmax v5.4s, v5.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
"fmax v30.4s, v30.4s, v27.4s\n"
"fmax v29.4s, v29.4s, v27.4s\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
"fmin v5.4s, v5.4s, v15.4s\n"
"st1 { v31.4s }, [x5]\n"
"fmin v30.4s, v30.4s, v15.4s\n"
@@ -509,23 +509,23 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"4:" // Tile loop: Oddments
"tst %x[n_channels], #0x3\n"
"beq 61f\n"
- "ldr q25, [x8, #0x0]\n"
- "ldr q0, [x8, #0x10]\n"
+ "ldr q25, [x6, #0x0]\n"
+ "ldr q0, [x6, #0x10]\n"
"add x9, x4, XZR\n"
"add x28, x4, x2\n"
- "ldr q1, [x8, #0x20]\n"
- "ldr q2, [x8, #0x30]\n"
- "add x27, x7, XZR\n"
- "add x26, x7, x2\n"
- "ldr q3, [x8, #0x40]\n"
- "ldr q4, [x8, #0x50]\n"
- "add x25, x4, x6\n"
- "add x24, x7, x6\n"
- "add x23, x4, x15\n"
- "add x22, x4, x13\n"
- "add x21, x7, x11\n"
- "add x20, x17, XZR\n"
- "add x8, x8, #0x60\n"
+ "ldr q1, [x6, #0x20]\n"
+ "ldr q2, [x6, #0x30]\n"
+ "add x27, x15, XZR\n"
+ "add x26, x15, x2\n"
+ "ldr q3, [x6, #0x40]\n"
+ "ldr q4, [x6, #0x50]\n"
+ "add x25, x4, x7\n"
+ "add x24, x15, x7\n"
+ "add x23, x4, x8\n"
+ "add x22, x4, x17\n"
+ "add x21, x15, x16\n"
+ "add x20, x14, XZR\n"
+ "add x6, x6, #0x60\n"
"tbz %x[n_channels], #1, 5f\n"
"ldr d5, [x9], #0x8\n"
"ldr d6, [x28], #0x8\n"
@@ -563,7 +563,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
"mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v5.4s\n"
"mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v6.4s\n"
- "add x20, x7, x15\n"
+ "add x20, x15, x8\n"
"mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v8.4s\n"
"fmla v28.4s, v1.4s, v6.4s\n"
@@ -583,7 +583,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
"fmla v31.4s, v2.4s, v5.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x20, x7, x13\n"
+ "add x20, x15, x17\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v5.4s\n"
"tbz %x[n_channels], #1, 9f\n"
@@ -596,7 +596,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
"fmla v31.4s, v3.4s, v6.4s\n"
"fmla v28.4s, v4.4s, v12.4s\n"
- "add x20, x4, x11\n"
+ "add x20, x4, x16\n"
"tbz %x[n_channels], #1, 11f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 12f\n"
@@ -605,13 +605,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
"ldr s9, [x20, #0x0]\n"
"12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v29.4s, v4.4s, v9.4s\n"
"fmla v30.4s, v4.4s, v6.4s\n"
- "add x20, x17, x2\n"
+ "add x20, x14, x2\n"
"fmla v31.4s, v4.4s, v10.4s\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v0.4s, v7.4s\n"
- "add x8, x8, #0x10\n"
"fmla v29.4s, v0.4s, v8.4s\n"
"fmla v30.4s, v0.4s, v14.4s\n"
"tbz %x[n_channels], #1, 13f\n"
@@ -622,13 +622,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
"ldr s11, [x20, #0x0]\n"
"14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.4s, v0.4s, v11.4s\n"
+ "add x20, x14, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v1.4s, v8.4s\n"
- "add x20, x17, x6\n"
"fmla v29.4s, v1.4s, v13.4s\n"
"fmla v30.4s, v1.4s, v11.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 15f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 16f\n"
@@ -637,13 +637,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
"ldr s12, [x20, #0x0]\n"
"16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x14, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v2.4s, v13.4s\n"
- "add x20, x17, x15\n"
"fmla v29.4s, v2.4s, v5.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 17f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 18f\n"
@@ -652,13 +652,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
"ldr s9, [x20, #0x0]\n"
"18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
+ "add x20, x14, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v3.4s, v5.4s\n"
- "add x20, x17, x13\n"
"fmla v29.4s, v3.4s, v6.4s\n"
"fmla v30.4s, v3.4s, v9.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 19f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 20f\n"
@@ -667,13 +667,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
"ldr s13, [x20, #0x0]\n"
"20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.4s, v3.4s, v13.4s\n"
+ "add x20, x14, x16\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v4.4s, v6.4s\n"
- "add x20, x17, x11\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v13.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 21f\n"
"ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #0, 22f\n"
@@ -682,12 +682,12 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
"ldr s8, [x20, #0x0]\n"
"22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v31.4s, v4.4s, v8.4s\n"
+ "add x20, x13, XZR\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v0.4s, v14.4s\n"
- "add x20, x16, XZR\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 23f\n"
"ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
@@ -697,7 +697,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s5, [x20, #0x0]\n"
"24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
"fmla v30.4s, v0.4s, v5.4s\n"
- "add x20, x16, x2\n"
+ "add x20, x13, x2\n"
"tbz %x[n_channels], #1, 25f\n"
"ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #0, 26f\n"
@@ -706,13 +706,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
"ldr s6, [x20, #0x0]\n"
"26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.4s, v0.4s, v6.4s\n"
+ "add x20, x13, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v1.4s, v11.4s\n"
- "add x20, x16, x6\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v30.4s, v1.4s, v6.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 27f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 28f\n"
@@ -721,13 +721,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
"ldr s10, [x20, #0x0]\n"
"28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.4s, v1.4s, v10.4s\n"
+ "add x20, x13, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v2.4s, v12.4s\n"
- "add x20, x16, x15\n"
"fmla v29.4s, v2.4s, v9.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 29f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 30f\n"
@@ -736,13 +736,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
"ldr s11, [x20, #0x0]\n"
"30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x13, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v3.4s, v9.4s\n"
- "add x20, x16, x13\n"
"fmla v29.4s, v3.4s, v13.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 31f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 32f\n"
@@ -751,13 +751,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
"ldr s12, [x20, #0x0]\n"
"32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
+ "add x20, x13, x16\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v4.4s, v13.4s\n"
- "add x20, x16, x11\n"
"fmla v29.4s, v4.4s, v8.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 33f\n"
"ldr d14, [x20], #0x8\n"
"tbz %x[n_channels], #0, 34f\n"
@@ -766,12 +766,12 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
"ldr s14, [x20, #0x0]\n"
"34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v31.4s, v4.4s, v14.4s\n"
+ "add x20, x12, XZR\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v0.4s, v5.4s\n"
- "add x20, x14, XZR\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 35f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 36f\n"
@@ -781,7 +781,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s9, [x20, #0x0]\n"
"36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
"fmla v30.4s, v0.4s, v9.4s\n"
- "add x20, x14, x2\n"
+ "add x20, x12, x2\n"
"tbz %x[n_channels], #1, 37f\n"
"ldr d13, [x20], #0x8\n"
"tbz %x[n_channels], #0, 38f\n"
@@ -790,13 +790,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
"ldr s13, [x20, #0x0]\n"
"38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.4s, v0.4s, v13.4s\n"
+ "add x20, x12, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v1.4s, v6.4s\n"
- "add x20, x14, x6\n"
"fmla v29.4s, v1.4s, v10.4s\n"
"fmla v30.4s, v1.4s, v13.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 39f\n"
"ldr d5, [x20], #0x8\n"
"tbz %x[n_channels], #0, 40f\n"
@@ -805,13 +805,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
"ldr s5, [x20, #0x0]\n"
"40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.4s, v1.4s, v5.4s\n"
+ "add x20, x12, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v2.4s, v10.4s\n"
- "add x20, x14, x15\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v5.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 41f\n"
"ldr d6, [x20], #0x8\n"
"tbz %x[n_channels], #0, 42f\n"
@@ -820,13 +820,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
"ldr s6, [x20, #0x0]\n"
"42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.4s, v2.4s, v6.4s\n"
+ "add x20, x12, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x20, x14, x13\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v6.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 43f\n"
"ldr d8, [x20], #0x8\n"
"tbz %x[n_channels], #0, 44f\n"
@@ -835,13 +835,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
"ldr s8, [x20, #0x0]\n"
"44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.4s, v3.4s, v8.4s\n"
+ "add x20, x12, x16\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v4.4s, v12.4s\n"
- "add x20, x14, x11\n"
"fmla v29.4s, v4.4s, v14.4s\n"
"fmla v30.4s, v4.4s, v8.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 45f\n"
"ldr d10, [x20], #0x8\n"
"tbz %x[n_channels], #0, 46f\n"
@@ -850,12 +850,12 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
"ldr s10, [x20, #0x0]\n"
"46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
- "ldr q0, [x8, #0x0]\n"
+ "ldr q0, [x6, #0x0]\n"
"fmla v31.4s, v4.4s, v10.4s\n"
+ "add x20, x11, XZR\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v0.4s, v9.4s\n"
- "add x20, x12, XZR\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 47f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 48f\n"
@@ -865,7 +865,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"ldr s11, [x20, #0x0]\n"
"48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
"fmla v30.4s, v0.4s, v11.4s\n"
- "add x20, x12, x2\n"
+ "add x20, x11, x2\n"
"tbz %x[n_channels], #1, 49f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 50f\n"
@@ -874,13 +874,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
"ldr s12, [x20, #0x0]\n"
"50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
- "ldr q1, [x8, #0x0]\n"
+ "ldr q1, [x6, #0x0]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
+ "add x20, x11, x7\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v1.4s, v13.4s\n"
- "add x20, x12, x6\n"
"fmla v29.4s, v1.4s, v5.4s\n"
"fmla v30.4s, v1.4s, v12.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 51f\n"
"ldr d9, [x20], #0x8\n"
"tbz %x[n_channels], #0, 52f\n"
@@ -889,13 +889,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
"ldr s9, [x20, #0x0]\n"
"52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
- "ldr q2, [x8, #0x0]\n"
+ "ldr q2, [x6, #0x0]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
+ "add x20, x11, x8\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v2.4s, v5.4s\n"
- "add x20, x12, x15\n"
"fmla v29.4s, v2.4s, v6.4s\n"
"fmla v30.4s, v2.4s, v9.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 53f\n"
"ldr d11, [x20], #0x8\n"
"tbz %x[n_channels], #0, 54f\n"
@@ -904,13 +904,13 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
"ldr s11, [x20, #0x0]\n"
"54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
- "ldr q3, [x8, #0x0]\n"
+ "ldr q3, [x6, #0x0]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x11, x17\n"
+ "add x6, x6, #0x10\n"
"fmla v28.4s, v3.4s, v6.4s\n"
- "add x20, x12, x13\n"
"fmla v29.4s, v3.4s, v8.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x8, x8, #0x10\n"
"tbz %x[n_channels], #1, 55f\n"
"ldr d12, [x20], #0x8\n"
"tbz %x[n_channels], #0, 56f\n"
@@ -919,10 +919,10 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
"ldr s12, [x20, #0x0]\n"
"56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
- "ldr q4, [x8, #0x0]\n"
+ "ldr q4, [x6, #0x0]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
+ "add x20, x11, x16\n"
"fmla v28.4s, v4.4s, v8.4s\n"
- "add x20, x12, x11\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
"tbz %x[n_channels], #1, 57f\n"
@@ -937,18 +937,18 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"fmax v28.4s, v28.4s, v27.4s\n"
"fmax v29.4s, v29.4s, v27.4s\n"
"fmax v30.4s, v30.4s, v27.4s\n"
- "fmax v31.4s, v31.4s, v27.4s\n"
"fmin v28.4s, v28.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"fmin v29.4s, v29.4s, v15.4s\n"
"fmin v30.4s, v30.4s, v15.4s\n"
"fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 59f\n"
"mov x21, x5\n"
"mov x20, x10\n"
- "st1 { v28.d }[0], [x21], x3\n"
- "st1 { v30.d }[0], [x20], x3\n"
"add x5, x5, #0x8\n"
"add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
"st1 { v29.d }[0], [x21]\n"
"st1 { v31.d }[0], [x20]\n"
"tbz %x[n_channels], #0, 60f\n"
@@ -968,16 +968,16 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"st1 { v31.s }[0], [x20]\n"
"60:" // Tile loop: Oddments: Store: Bit 1: End
"61:" // Tile loop: End
- "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x26, x26, #0x1\n"
- "add x21, x27, #0x1\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x26, x20\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x27, x27, x21, LT\n"
- "csel x26, x26, XZR, LT\n"
- "cmp x27, x20\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x10, x10, #0x1\n"
+ "add x20, x11, #0x1\n"
+ "cmp x10, x22\n"
+ "csel x11, x11, x20, LT\n"
+ "csel x10, x10, XZR, LT\n"
+ "cmp x11, x21\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 32939eb6dc..d50b396261 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -98,478 +98,478 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "mov x17, #0x10\n" // cntb _, ALL, #1
- "lsr x9, %x[n_channels], #0x2\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x8, #0x10\n" // cntb _, ALL, #1
+ "lsr x17, %x[n_channels], #0x2\n"
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x20, %x[params_struct], %[offsetof_args_min]\n"
- "ld1r { v27.4s }, [x20]\n"
+ "add x21, %x[params_struct], %[offsetof_args_min]\n"
"add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x21]\n"
"ld1r { v15.4s }, [x20]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "ldp x12, x11, [x21, #0x10]\n"
- "mov x10, #0x0\n"
- "sub x28, XZR, x17\n"
- "cbz x9, 3f\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x22, #0x0]\n"
+ "ldp x11, x10, [x22, #0x10]\n"
+ "sub x9, XZR, x8\n"
+ "cbz x17, 3f\n"
"ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "cmp x17, x9, LSL #4\n"
+ "cmp x8, x17, LSL #4\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
"add x16, x16, #0x60\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "ldr q5, [x21, x10]\n"
- "ldr q6, [x20, x10]\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q7, [x21, x10]\n"
- "ldr q8, [x20, x10]\n"
- "ldp x21, x20, [x15, #0x20]\n"
- "ldr q9, [x21, x10]\n"
- "ldr q13, [x20, x10]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
"ldp x21, x20, [x15, #0x30]\n"
- "ldr q11, [x21, x10]\n"
- "ldr q12, [x20, x10]\n"
+ "ldr q5, [x27, x14]\n"
+ "ldr q6, [x26, x14]\n"
+ "ldr q7, [x25, x14]\n"
+ "ldr q8, [x24, x14]\n"
+ "ldr q9, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
"ldp x21, x20, [x15, #0x40]\n"
- "ldr q10, [x21, x10]\n"
- "ldr q14, [x20, x10]\n"
+ "ldr q10, [x21, x14]\n"
+ "ldr q14, [x20, x14]\n"
"bge 2f\n"
"1:" // Channel loop
"mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v5.4s\n"
"mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v6.4s\n"
- "ldr x20, [x15, #0x50]\n"
- "ldr q24, [x20, x10]\n"
- "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v7.4s\n"
- "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr x21, [x15, #0x50]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v8.4s\n"
"ldr q23, [x16, #0x0]\n"
"ldr q26, [x16, #0x140]\n"
+ "ldr x22, [x15, #0x60]\n"
+ "ldr x25, [x15, #0x68]\n"
+ "add x9, x9, #0x10\n"
+ "ldr q22, [x21, x14]\n"
+ "ldr x24, [x15, #0x70]\n"
"fmla v30.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x20, x14]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
- "ldr x20, [x15, #0x58]\n"
- "ldr q22, [x20, x10]\n"
- "fmla v28.4s, v1.4s, v8.4s\n"
- "fmla v29.4s, v1.4s, v13.4s\n"
- "ldr q21, [x16, #0x10]\n"
- "ldr x20, [x15, #0x60]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "ldr x27, [x15, #0x80]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "ldr x23, [x15, #0x90]\n"
"fmla v30.4s, v2.4s, v9.4s\n"
- "ldr q17, [x20, x10]\n"
+ "ldr q18, [x22, x14]\n"
+ "ldr x26, [x15, #0x98]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
- "ldr x20, [x15, #0x68]\n"
- "fmla v28.4s, v2.4s, v13.4s\n"
- "fmla v29.4s, v2.4s, v24.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v28.4s, v2.4s, v22.4s\n"
"ldr q16, [x16, #0x20]\n"
- "ldr x22, [x15, #0x70]\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "ldr q5, [x20, x10]\n"
+ "ldr q20, [x25, x14]\n"
+ "ldr x25, [x15, #0xa8]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla v28.4s, v3.4s, v24.4s\n"
"fmla v29.4s, v3.4s, v22.4s\n"
- "ldr q20, [x16, #0x30]\n"
- "ldr x21, [x15, #0x80]\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "ldr q17, [x16, #0x30]\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "ldr q19, [x22, x10]\n"
- "fmla v31.4s, v4.4s, v17.4s\n"
- "ldr q2, [x20, x10]\n"
- "fmla v28.4s, v4.4s, v22.4s\n"
- "fmla v29.4s, v4.4s, v10.4s\n"
- "ldr q18, [x16, #0x40]\n"
- "ldr x20, [x15, #0x88]\n"
+ "ldr q3, [x24, x14]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v31.4s, v4.4s, v18.4s\n"
+ "ldr q2, [x21, x14]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q19, [x16, #0x40]\n"
"fmla v30.4s, v23.4s, v7.4s\n"
"fmla v31.4s, v23.4s, v8.4s\n"
- "ldr x23, [x15, #0x90]\n"
- "ldr x26, [x15, #0x98]\n"
- "fmla v28.4s, v23.4s, v14.4s\n"
- "fmla v29.4s, v23.4s, v5.4s\n"
- "ldr q1, [x16, #0x50]\n"
- "ldr x22, [x15, #0xa0]\n"
- "fmla v30.4s, v21.4s, v8.4s\n"
- "ldr q25, [x20, x10]\n"
- "fmla v31.4s, v21.4s, v13.4s\n"
- "ldr x25, [x15, #0xa8]\n"
- "fmla v28.4s, v21.4s, v5.4s\n"
- "fmla v29.4s, v21.4s, v19.4s\n"
- "ldr q17, [x16, #0x60]\n"
- "ldr x24, [x15, #0xb0]\n"
+ "fmla v29.4s, v23.4s, v14.4s\n"
+ "fmla v28.4s, v23.4s, v20.4s\n"
+ "ldr q18, [x16, #0x50]\n"
+ "fmla v30.4s, v0.4s, v8.4s\n"
+ "ldr q25, [x20, x14]\n"
+ "ldr x28, [x15, #0xc8]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v29.4s, v0.4s, v20.4s\n"
+ "fmla v28.4s, v0.4s, v3.4s\n"
+ "ldr q11, [x16, #0x60]\n"
"fmla v30.4s, v16.4s, v13.4s\n"
- "ldr q8, [x21, x10]\n"
- "fmla v31.4s, v16.4s, v24.4s\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla v28.4s, v16.4s, v19.4s\n"
- "fmla v29.4s, v16.4s, v2.4s\n"
+ "ldr q24, [x27, x14]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.4s, v16.4s, v22.4s\n"
+ "fmla v29.4s, v16.4s, v3.4s\n"
+ "fmla v28.4s, v16.4s, v2.4s\n"
"ldr q16, [x16, #0x70]\n"
- "ldr x21, [x15, #0xc0]\n"
- "fmla v30.4s, v20.4s, v24.4s\n"
- "ldr q24, [x23, x10]\n"
- "fmla v31.4s, v20.4s, v22.4s\n"
- "ldr x27, [x15, #0xc8]\n"
- "fmla v28.4s, v20.4s, v2.4s\n"
- "fmla v29.4s, v20.4s, v8.4s\n"
- "ldr q23, [x16, #0x80]\n"
+ "fmla v30.4s, v17.4s, v22.4s\n"
+ "ldr q5, [x23, x14]\n"
"ldr x23, [x15, #0xd0]\n"
- "fmla v30.4s, v18.4s, v22.4s\n"
- "ldr q22, [x26, x10]\n"
- "fmla v31.4s, v18.4s, v10.4s\n"
- "ldr q21, [x22, x10]\n"
- "fmla v28.4s, v18.4s, v8.4s\n"
- "fmla v29.4s, v18.4s, v25.4s\n"
- "ldr q20, [x16, #0x90]\n"
- "ldr x22, [x15, #0xd8]\n"
- "fmla v30.4s, v1.4s, v14.4s\n"
- "ldr q0, [x20, x10]\n"
- "fmla v31.4s, v1.4s, v5.4s\n"
- "ldr x20, [x15, #0xe0]\n"
- "fmla v28.4s, v1.4s, v24.4s\n"
- "fmla v29.4s, v1.4s, v22.4s\n"
- "ldr q6, [x16, #0xa0]\n"
+ "fmla v31.4s, v17.4s, v21.4s\n"
+ "fmla v29.4s, v17.4s, v2.4s\n"
+ "fmla v28.4s, v17.4s, v24.4s\n"
+ "ldr q17, [x16, #0x80]\n"
+ "fmla v30.4s, v19.4s, v21.4s\n"
+ "ldr q23, [x26, x14]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.4s, v19.4s, v10.4s\n"
+ "ldr q22, [x22, x14]\n"
+ "ldr x22, [x15, #0xe0]\n"
+ "fmla v29.4s, v19.4s, v24.4s\n"
+ "fmla v28.4s, v19.4s, v25.4s\n"
+ "ldr q21, [x16, #0x90]\n"
+ "fmla v30.4s, v18.4s, v14.4s\n"
+ "ldr q1, [x21, x14]\n"
"ldr x26, [x15, #0xf8]\n"
- "fmla v30.4s, v17.4s, v5.4s\n"
- "ldr q1, [x25, x10]\n"
- "fmla v31.4s, v17.4s, v19.4s\n"
+ "fmla v31.4s, v18.4s, v20.4s\n"
+ "fmla v29.4s, v18.4s, v5.4s\n"
+ "fmla v28.4s, v18.4s, v23.4s\n"
+ "ldr q12, [x16, #0xa0]\n"
+ "fmla v30.4s, v11.4s, v20.4s\n"
+ "ldr q0, [x25, x14]\n"
"ldr x25, [x15, #0xe8]\n"
- "fmla v28.4s, v17.4s, v22.4s\n"
- "fmla v29.4s, v17.4s, v21.4s\n"
- "ldr q18, [x16, #0xb0]\n"
- "add x28, x28, #0x10\n"
- "fmla v30.4s, v16.4s, v19.4s\n"
- "ldr q19, [x24, x10]\n"
- "fmla v31.4s, v16.4s, v2.4s\n"
+ "fmla v31.4s, v11.4s, v3.4s\n"
+ "fmla v29.4s, v11.4s, v23.4s\n"
+ "fmla v28.4s, v11.4s, v22.4s\n"
+ "ldr q20, [x16, #0xb0]\n"
+ "fmla v30.4s, v16.4s, v3.4s\n"
+ "ldr q19, [x24, x14]\n"
"ldr x24, [x15, #0xf0]\n"
- "fmla v28.4s, v16.4s, v21.4s\n"
- "fmla v29.4s, v16.4s, v1.4s\n"
- "ldr q17, [x16, #0xc0]\n"
- "fmla v30.4s, v23.4s, v2.4s\n"
- "ldr q16, [x21, x10]\n"
- "fmla v31.4s, v23.4s, v8.4s\n"
+ "fmla v31.4s, v16.4s, v2.4s\n"
+ "fmla v29.4s, v16.4s, v22.4s\n"
+ "fmla v28.4s, v16.4s, v0.4s\n"
+ "ldr q18, [x16, #0xc0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "ldr q16, [x20, x14]\n"
"ldr x21, [x15, #0x100]\n"
- "fmla v28.4s, v23.4s, v1.4s\n"
- "fmla v29.4s, v23.4s, v19.4s\n"
- "ldr q13, [x16, #0xd0]\n"
- "fmla v30.4s, v20.4s, v8.4s\n"
- "ldr q2, [x27, x10]\n"
- "fmla v31.4s, v20.4s, v25.4s\n"
- "ldr q10, [x20, x10]\n"
- "fmla v28.4s, v20.4s, v19.4s\n"
- "fmla v29.4s, v20.4s, v0.4s\n"
- "ldr q9, [x16, #0xe0]\n"
+ "fmla v31.4s, v17.4s, v24.4s\n"
+ "fmla v29.4s, v17.4s, v0.4s\n"
+ "fmla v28.4s, v17.4s, v19.4s\n"
+ "ldr q17, [x16, #0xd0]\n"
+ "fmla v30.4s, v21.4s, v24.4s\n"
+ "ldr q14, [x28, x14]\n"
"ldr x20, [x15, #0x108]\n"
- "fmla v30.4s, v6.4s, v24.4s\n"
- "ldr q5, [x23, x10]\n"
- "fmla v31.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q4, [x22, x14]\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "fmla v28.4s, v21.4s, v1.4s\n"
+ "ldr q7, [x16, #0xe0]\n"
+ "fmla v30.4s, v12.4s, v5.4s\n"
+ "ldr q25, [x23, x14]\n"
"ldr x23, [x15, #0x110]\n"
- "fmla v28.4s, v6.4s, v16.4s\n"
- "fmla v29.4s, v6.4s, v2.4s\n"
- "ldr q24, [x16, #0xf0]\n"
- "fmla v30.4s, v18.4s, v22.4s\n"
- "ldr q25, [x22, x10]\n"
- "fmla v31.4s, v18.4s, v21.4s\n"
+ "fmla v31.4s, v12.4s, v23.4s\n"
+ "fmla v29.4s, v12.4s, v16.4s\n"
+ "fmla v28.4s, v12.4s, v14.4s\n"
+ "ldr q11, [x16, #0xf0]\n"
+ "fmla v30.4s, v20.4s, v23.4s\n"
+ "ldr q24, [x27, x14]\n"
"ldr x22, [x15, #0x118]\n"
- "fmla v28.4s, v18.4s, v2.4s\n"
- "fmla v29.4s, v18.4s, v5.4s\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "fmla v29.4s, v20.4s, v14.4s\n"
+ "fmla v28.4s, v20.4s, v25.4s\n"
"ldr q23, [x16, #0x100]\n"
- "fmla v30.4s, v17.4s, v21.4s\n"
- "ldr q22, [x25, x10]\n"
- "fmla v31.4s, v17.4s, v1.4s\n"
- "fmla v28.4s, v17.4s, v5.4s\n"
- "fmla v29.4s, v17.4s, v25.4s\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q22, [x25, x14]\n"
+ "fmla v31.4s, v18.4s, v0.4s\n"
+ "fmla v29.4s, v18.4s, v25.4s\n"
+ "fmla v28.4s, v18.4s, v24.4s\n"
"ldr q21, [x16, #0x110]\n"
- "fmla v30.4s, v13.4s, v1.4s\n"
- "ldr q18, [x24, x10]\n"
- "fmla v31.4s, v13.4s, v19.4s\n"
- "fmla v28.4s, v13.4s, v25.4s\n"
- "fmla v29.4s, v13.4s, v10.4s\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v31.4s, v17.4s, v19.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "fmla v28.4s, v17.4s, v4.4s\n"
"ldr q20, [x16, #0x120]\n"
- "fmla v30.4s, v9.4s, v19.4s\n"
- "ldr q17, [x26, x10]\n"
- "fmla v31.4s, v9.4s, v0.4s\n"
- "fmla v28.4s, v9.4s, v10.4s\n"
- "fmla v29.4s, v9.4s, v22.4s\n"
+ "fmla v30.4s, v7.4s, v19.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v31.4s, v7.4s, v1.4s\n"
+ "fmla v29.4s, v7.4s, v4.4s\n"
+ "fmla v28.4s, v7.4s, v22.4s\n"
"ldr q19, [x16, #0x130]\n"
- "fmla v30.4s, v24.4s, v16.4s\n"
- "ldr q16, [x21, x10]\n"
- "fmla v31.4s, v24.4s, v2.4s\n"
- "fmla v28.4s, v24.4s, v18.4s\n"
- "ldr q18, [x20, x10]\n"
- "fmla v29.4s, v24.4s, v17.4s\n"
+ "fmla v30.4s, v11.4s, v16.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v31.4s, v11.4s, v14.4s\n"
+ "fmla v29.4s, v11.4s, v18.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "ldp x20, x21, [x15, #0x0]\n"
+ "fmla v28.4s, v11.4s, v17.4s\n"
"ldr q0, [x16, #0x150]\n"
- "fmla v30.4s, v23.4s, v2.4s\n"
- "fmla v31.4s, v23.4s, v5.4s\n"
- "ldp x21, x20, [x15, #0x0]\n"
- "fmla v28.4s, v23.4s, v17.4s\n"
- "ldr q17, [x23, x10]\n"
- "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v30.4s, v23.4s, v14.4s\n"
+ "fmla v31.4s, v23.4s, v25.4s\n"
+ "fmla v29.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x23, x14]\n"
+ "fmla v28.4s, v23.4s, v16.4s\n"
"ldr q1, [x16, #0x160]\n"
- "fmla v30.4s, v21.4s, v5.4s\n"
- "ldr q5, [x21, x17]\n"
- "fmla v31.4s, v21.4s, v25.4s\n"
- "fmla v28.4s, v21.4s, v16.4s\n"
- "ldr q16, [x22, x10]\n"
- "fmla v29.4s, v21.4s, v18.4s\n"
+ "fmla v30.4s, v21.4s, v25.4s\n"
+ "ldr q5, [x20, x8]\n"
+ "fmla v31.4s, v21.4s, v24.4s\n"
+ "fmla v29.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "ldp x20, x26, [x15, #0x10]\n"
+ "ldp x25, x24, [x15, #0x20]\n"
+ "ldp x23, x22, [x15, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q7, [x20, x8]\n"
+ "fmla v28.4s, v21.4s, v18.4s\n"
"ldr q2, [x16, #0x170]\n"
- "fmla v30.4s, v20.4s, v25.4s\n"
- "ldr q6, [x20, x17]\n"
- "fmla v31.4s, v20.4s, v10.4s\n"
- "ldp x21, x20, [x15, #0x10]\n"
- "ldr q7, [x21, x17]\n"
- "fmla v28.4s, v20.4s, v18.4s\n"
- "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v30.4s, v20.4s, v24.4s\n"
+ "ldr q6, [x21, x8]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q13, [x24, x8]\n"
+ "fmla v31.4s, v20.4s, v4.4s\n"
+ "fmla v29.4s, v20.4s, v18.4s\n"
+ "ldr q11, [x23, x8]\n"
+ "ldr q14, [x20, x8]\n"
+ "fmla v28.4s, v20.4s, v17.4s\n"
"ldr q3, [x16, #0x180]\n"
- "fmla v30.4s, v19.4s, v10.4s\n"
- "ldr q8, [x20, x17]\n"
+ "fmla v30.4s, v19.4s, v4.4s\n"
+ "ldr q8, [x26, x8]\n"
"fmla v31.4s, v19.4s, v22.4s\n"
- "ldp x21, x20, [x15, #0x20]\n"
- "ldr q13, [x20, x17]\n"
- "fmla v28.4s, v19.4s, v17.4s\n"
- "fmla v29.4s, v19.4s, v16.4s\n"
- "ldr q9, [x21, x17]\n"
+ "ldr q10, [x21, x8]\n"
+ "fmla v29.4s, v19.4s, v17.4s\n"
+ "ldr q12, [x22, x8]\n"
+ "fmla v28.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x25, x8]\n"
+ "add x8, x8, #0x10\n"
"ldr q4, [x16, #0x190]\n"
- "ldp x21, x20, [x15, #0x30]\n"
+ "cmp x8, x17, LSL #4\n"
+ "add x16, x16, #0x1a0\n"
"fmax v30.4s, v30.4s, v27.4s\n"
"fmax v31.4s, v31.4s, v27.4s\n"
- "ldr q11, [x21, x17]\n"
- "ldr q12, [x20, x17]\n"
- "fmax v28.4s, v28.4s, v27.4s\n"
"fmax v29.4s, v29.4s, v27.4s\n"
- "ldp x21, x20, [x15, #0x40]\n"
- "ldr q10, [x21, x17]\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
"fmin v30.4s, v30.4s, v15.4s\n"
"fmin v31.4s, v31.4s, v15.4s\n"
- "ldr q14, [x20, x17]\n"
- "add x17, x17, #0x10\n"
- "cmp x17, x9, LSL #4\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
"fmin v29.4s, v29.4s, v15.4s\n"
- "add x10, x10, #0x10\n"
- "str q30, [x14, x28]\n"
- "add x16, x16, #0x1a0\n"
- "str q31, [x13, x28]\n"
- "str q28, [x12, x28]\n"
- "str q29, [x11, x28]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q30, [x13, x9]\n"
+ "str q31, [x12, x9]\n"
+ "str q29, [x11, x9]\n"
+ "str q28, [x10, x9]\n"
"blt 1b\n"
"2:" // Channel tail
"mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v5.4s\n"
"mov v5.16b, v26.16b\n fmla v5.4s, v0.4s, v6.4s\n"
- "ldr x20, [x15, #0x50]\n"
- "ldr q22, [x20, x10]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "ldr x21, [x15, #0x58]\n"
"mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
"ldr q19, [x16, #0x0]\n"
- "ldr x20, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x25, [x15, #0x68]\n"
+ "ldr x24, [x15, #0x70]\n"
+ "add x9, x9, #0x10\n"
+ "ldr q22, [x22, x14]\n"
+ "ldr x23, [x15, #0x78]\n"
"fmla v31.4s, v1.4s, v6.4s\n"
- "ldr q21, [x20, x10]\n"
+ "ldr q21, [x21, x14]\n"
"fmla v5.4s, v1.4s, v9.4s\n"
- "ldr x21, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x80]\n"
"fmla v30.4s, v1.4s, v8.4s\n"
"fmla v29.4s, v1.4s, v13.4s\n"
"ldr q18, [x16, #0x10]\n"
- "ldr x20, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
- "ldr q16, [x21, x10]\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0xa0]\n"
"fmla v5.4s, v2.4s, v11.4s\n"
- "ldr x23, [x15, #0x70]\n"
"fmla v30.4s, v2.4s, v13.4s\n"
"fmla v29.4s, v2.4s, v22.4s\n"
"ldr q17, [x16, #0x20]\n"
- "ldr x21, [x15, #0x78]\n"
"fmla v31.4s, v3.4s, v11.4s\n"
- "ldr q6, [x20, x10]\n"
+ "ldr q6, [x25, x14]\n"
+ "ldr x25, [x15, #0xa8]\n"
"fmla v5.4s, v3.4s, v12.4s\n"
- "ldr x22, [x15, #0x80]\n"
"fmla v30.4s, v3.4s, v22.4s\n"
"fmla v29.4s, v3.4s, v21.4s\n"
"ldr q20, [x16, #0x30]\n"
- "ldr x20, [x15, #0x88]\n"
"fmla v31.4s, v4.4s, v12.4s\n"
- "ldr q2, [x23, x10]\n"
+ "ldr q2, [x24, x14]\n"
+ "ldr x24, [x15, #0xb0]\n"
"fmla v5.4s, v4.4s, v16.4s\n"
- "ldr q28, [x21, x10]\n"
+ "ldr q28, [x23, x14]\n"
+ "ldr x23, [x15, #0xb8]\n"
"fmla v30.4s, v4.4s, v21.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"ldr q16, [x16, #0x40]\n"
- "ldr x21, [x15, #0x90]\n"
"fmla v31.4s, v19.4s, v7.4s\n"
"fmla v5.4s, v19.4s, v8.4s\n"
- "ldr x27, [x15, #0x98]\n"
- "ldr x26, [x15, #0xa0]\n"
"fmla v30.4s, v19.4s, v14.4s\n"
"fmla v29.4s, v19.4s, v6.4s\n"
"ldr q19, [x16, #0x50]\n"
- "ldr x25, [x15, #0xa8]\n"
"fmla v31.4s, v18.4s, v8.4s\n"
- "ldr q1, [x20, x10]\n"
+ "ldr q1, [x22, x14]\n"
+ "ldr x28, [x15, #0xc8]\n"
"fmla v5.4s, v18.4s, v13.4s\n"
- "ldr x24, [x15, #0xb0]\n"
"fmla v30.4s, v18.4s, v6.4s\n"
"fmla v29.4s, v18.4s, v2.4s\n"
"ldr q18, [x16, #0x60]\n"
- "ldr x20, [x15, #0xb8]\n"
"fmla v31.4s, v17.4s, v13.4s\n"
- "ldr q26, [x22, x10]\n"
+ "ldr q26, [x27, x14]\n"
+ "ldr x22, [x15, #0xc0]\n"
"fmla v5.4s, v17.4s, v22.4s\n"
- "ldr x23, [x15, #0xc0]\n"
"fmla v30.4s, v17.4s, v2.4s\n"
"fmla v29.4s, v17.4s, v28.4s\n"
"ldr q17, [x16, #0x70]\n"
- "ldr x22, [x15, #0xc8]\n"
"fmla v31.4s, v20.4s, v22.4s\n"
- "ldr q25, [x21, x10]\n"
- "fmla v5.4s, v20.4s, v21.4s\n"
+ "ldr q25, [x21, x14]\n"
"ldr x21, [x15, #0xd0]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
"fmla v30.4s, v20.4s, v28.4s\n"
"fmla v29.4s, v20.4s, v26.4s\n"
"ldr q24, [x16, #0x80]\n"
- "add x28, x28, #0x10\n"
"fmla v31.4s, v16.4s, v21.4s\n"
- "ldr q23, [x27, x10]\n"
+ "ldr q23, [x26, x14]\n"
+ "ldr x27, [x15, #0xd8]\n"
"fmla v5.4s, v16.4s, v10.4s\n"
- "ldr q0, [x26, x10]\n"
+ "ldr q0, [x20, x14]\n"
+ "ldr x20, [x15, #0xe0]\n"
"fmla v30.4s, v16.4s, v26.4s\n"
"fmla v29.4s, v16.4s, v1.4s\n"
"ldr q22, [x16, #0x90]\n"
- "ldr x27, [x15, #0xd8]\n"
"fmla v31.4s, v19.4s, v14.4s\n"
- "ldr q16, [x20, x10]\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0xf8]\n"
"fmla v5.4s, v19.4s, v6.4s\n"
- "ldr x20, [x15, #0xe0]\n"
"fmla v30.4s, v19.4s, v25.4s\n"
"fmla v29.4s, v19.4s, v23.4s\n"
"ldr q21, [x16, #0xa0]\n"
- "ldr x26, [x15, #0xf8]\n"
"fmla v31.4s, v18.4s, v6.4s\n"
- "ldr q20, [x25, x10]\n"
- "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr q20, [x25, x14]\n"
"ldr x25, [x15, #0xe8]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
"fmla v30.4s, v18.4s, v23.4s\n"
"fmla v29.4s, v18.4s, v0.4s\n"
"ldr q18, [x16, #0xb0]\n"
"fmla v31.4s, v17.4s, v2.4s\n"
- "ldr q19, [x24, x10]\n"
- "fmla v5.4s, v17.4s, v28.4s\n"
+ "ldr q19, [x24, x14]\n"
"ldr x24, [x15, #0xf0]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
"fmla v30.4s, v17.4s, v0.4s\n"
"fmla v29.4s, v17.4s, v20.4s\n"
"ldr q17, [x16, #0xc0]\n"
"fmla v31.4s, v24.4s, v28.4s\n"
- "ldr q7, [x23, x10]\n"
- "fmla v5.4s, v24.4s, v26.4s\n"
+ "ldr q10, [x22, x14]\n"
"ldr x23, [x15, #0x100]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
"fmla v30.4s, v24.4s, v20.4s\n"
"fmla v29.4s, v24.4s, v19.4s\n"
- "ldr q3, [x16, #0xd0]\n"
+ "ldr q13, [x16, #0xd0]\n"
"fmla v31.4s, v22.4s, v26.4s\n"
- "ldr q28, [x22, x10]\n"
+ "ldr q28, [x28, x14]\n"
+ "ldr x22, [x15, #0x108]\n"
"fmla v5.4s, v22.4s, v1.4s\n"
- "ldr q13, [x20, x10]\n"
+ "ldr q14, [x20, x14]\n"
"fmla v30.4s, v22.4s, v19.4s\n"
"fmla v29.4s, v22.4s, v16.4s\n"
- "ldr q11, [x16, #0xe0]\n"
- "ldr x22, [x15, #0x108]\n"
+ "ldr q12, [x16, #0xe0]\n"
"fmla v31.4s, v21.4s, v25.4s\n"
- "ldr q26, [x21, x10]\n"
- "fmla v5.4s, v21.4s, v23.4s\n"
+ "ldr q26, [x21, x14]\n"
"ldr x21, [x15, #0x110]\n"
- "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "fmla v30.4s, v21.4s, v10.4s\n"
"fmla v29.4s, v21.4s, v28.4s\n"
"ldr q25, [x16, #0xf0]\n"
"fmla v31.4s, v18.4s, v23.4s\n"
- "ldr q24, [x27, x10]\n"
- "fmla v5.4s, v18.4s, v0.4s\n"
+ "ldr q24, [x27, x14]\n"
"ldr x20, [x15, #0x118]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
"fmla v30.4s, v18.4s, v28.4s\n"
"fmla v29.4s, v18.4s, v26.4s\n"
"ldr q23, [x16, #0x100]\n"
"fmla v31.4s, v17.4s, v0.4s\n"
- "ldr q22, [x25, x10]\n"
+ "ldr q22, [x25, x14]\n"
"fmla v5.4s, v17.4s, v20.4s\n"
"fmla v30.4s, v17.4s, v26.4s\n"
"fmla v29.4s, v17.4s, v24.4s\n"
"ldr q21, [x16, #0x110]\n"
- "fmla v31.4s, v3.4s, v20.4s\n"
- "ldr q18, [x24, x10]\n"
- "fmla v5.4s, v3.4s, v19.4s\n"
- "fmla v30.4s, v3.4s, v24.4s\n"
- "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v31.4s, v13.4s, v20.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v5.4s, v13.4s, v19.4s\n"
+ "fmla v30.4s, v13.4s, v24.4s\n"
+ "fmla v29.4s, v13.4s, v14.4s\n"
"ldr q20, [x16, #0x120]\n"
- "fmla v31.4s, v11.4s, v19.4s\n"
- "ldr q17, [x26, x10]\n"
- "fmla v5.4s, v11.4s, v16.4s\n"
- "fmla v30.4s, v11.4s, v13.4s\n"
- "fmla v29.4s, v11.4s, v22.4s\n"
+ "fmla v31.4s, v12.4s, v19.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "fmla v5.4s, v12.4s, v16.4s\n"
+ "fmla v30.4s, v12.4s, v14.4s\n"
+ "fmla v29.4s, v12.4s, v22.4s\n"
"ldr q19, [x16, #0x130]\n"
"add x16, x16, #0x140\n"
- "fmla v31.4s, v25.4s, v7.4s\n"
- "ldr q16, [x23, x10]\n"
+ "fmla v31.4s, v25.4s, v10.4s\n"
+ "ldr q16, [x23, x14]\n"
"fmla v5.4s, v25.4s, v28.4s\n"
"fmla v30.4s, v25.4s, v18.4s\n"
- "ldr q18, [x22, x10]\n"
+ "ldr q18, [x22, x14]\n"
"fmla v29.4s, v25.4s, v17.4s\n"
"fmla v31.4s, v23.4s, v28.4s\n"
"fmla v5.4s, v23.4s, v26.4s\n"
"fmla v30.4s, v23.4s, v17.4s\n"
- "ldr q17, [x21, x10]\n"
+ "ldr q17, [x21, x14]\n"
"fmla v29.4s, v23.4s, v16.4s\n"
"fmla v31.4s, v21.4s, v26.4s\n"
"fmla v5.4s, v21.4s, v24.4s\n"
"fmla v30.4s, v21.4s, v16.4s\n"
- "ldr q16, [x20, x10]\n"
+ "ldr q16, [x20, x14]\n"
+ "add x14, x14, #0x10\n"
"fmla v29.4s, v21.4s, v18.4s\n"
- "add x10, x10, #0x10\n"
"fmla v31.4s, v20.4s, v24.4s\n"
- "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v5.4s, v20.4s, v14.4s\n"
"fmla v30.4s, v20.4s, v18.4s\n"
"fmla v29.4s, v20.4s, v17.4s\n"
- "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
"fmla v5.4s, v19.4s, v22.4s\n"
- "fmax v31.4s, v31.4s, v27.4s\n"
"fmla v30.4s, v19.4s, v17.4s\n"
"fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"fmax v5.4s, v5.4s, v27.4s\n"
"fmax v30.4s, v30.4s, v27.4s\n"
- "fmax v29.4s, v29.4s, v27.4s\n"
"fmin v31.4s, v31.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
"fmin v5.4s, v5.4s, v15.4s\n"
- "str q31, [x14, x28]\n"
"fmin v30.4s, v30.4s, v15.4s\n"
"fmin v29.4s, v29.4s, v15.4s\n"
- "str q5, [x13, x28]\n"
- "str q30, [x12, x28]\n"
- "str q29, [x11, x28]\n"
+ "str q31, [x13, x9]\n"
+ "str q5, [x12, x9]\n"
+ "str q30, [x11, x9]\n"
+ "str q29, [x10, x9]\n"
"3:" // Oddments
"tst %x[n_channels], #0x3\n"
"beq 60f\n"
"ldr q26, [x16, #0x0]\n"
"ldr q0, [x16, #0x10]\n"
- "mov x20, x10\n"
- "add x14, x14, x20\n"
+ "mov x20, x14\n"
"ldr q1, [x16, #0x20]\n"
"ldr q2, [x16, #0x30]\n"
- "add x13, x13, x20\n"
- "add x12, x12, x20\n"
"ldr q3, [x16, #0x40]\n"
"ldr q4, [x16, #0x50]\n"
- "add x11, x11, x20\n"
"ldr x9, [x15, #0x0]\n"
"ldr x28, [x15, #0x8]\n"
- "add x9, x9, x10\n"
- "add x28, x28, x10\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
"ldr x27, [x15, #0x10]\n"
"ldr x26, [x15, #0x18]\n"
- "add x27, x27, x10\n"
- "add x26, x26, x10\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
"ldr x25, [x15, #0x20]\n"
"ldr x24, [x15, #0x28]\n"
- "add x25, x25, x10\n"
- "add x24, x24, x10\n"
"ldr x23, [x15, #0x30]\n"
"ldr x22, [x15, #0x38]\n"
- "add x23, x23, x10\n"
- "add x22, x22, x10\n"
+ "add x9, x9, x14\n"
+ "add x28, x28, x14\n"
"ldr x21, [x15, #0x40]\n"
"ldr x20, [x15, #0x48]\n"
- "add x21, x21, x10\n"
- "add x20, x20, x10\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
"add x16, x16, #0x60\n"
"tbz %x[n_channels], #1, 4f\n"
"ld1 { v5.d }[0], [x9], #0x8\n"
@@ -609,9 +609,9 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v5.4s\n"
"mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v6.4s\n"
"ldr x20, [x15, #0x50]\n"
- "add x20, x20, x10\n"
"mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
"mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "add x20, x20, x14\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v9.4s\n"
"fmla v30.4s, v1.4s, v8.4s\n"
@@ -630,9 +630,9 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x58]\n"
"fmla v31.4s, v2.4s, v5.4s\n"
"fmla v28.4s, v3.4s, v11.4s\n"
- "add x20, x20, x10\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v5.4s\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 8f\n"
"ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 9f\n"
@@ -644,7 +644,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x20, [x15, #0x60]\n"
"fmla v31.4s, v3.4s, v6.4s\n"
"fmla v28.4s, v4.4s, v12.4s\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 10f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 11f\n"
@@ -658,11 +658,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v30.4s, v4.4s, v6.4s\n"
"ldr x20, [x15, #0x68]\n"
"fmla v31.4s, v4.4s, v10.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v0.4s, v7.4s\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"fmla v29.4s, v0.4s, v8.4s\n"
"fmla v30.4s, v0.4s, v14.4s\n"
- "add x16, x16, #0x10\n"
"tbz %x[n_channels], #1, 12f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 13f\n"
@@ -674,11 +674,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0x70]\n"
"fmla v31.4s, v0.4s, v11.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v1.4s, v8.4s\n"
"fmla v29.4s, v1.4s, v13.4s\n"
"fmla v30.4s, v1.4s, v11.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 14f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 15f\n"
@@ -690,11 +690,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0x78]\n"
"fmla v31.4s, v1.4s, v12.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v2.4s, v13.4s\n"
"fmla v29.4s, v2.4s, v5.4s\n"
"fmla v30.4s, v2.4s, v12.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 16f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
@@ -706,11 +706,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0x80]\n"
"fmla v31.4s, v2.4s, v9.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v3.4s, v5.4s\n"
"fmla v29.4s, v3.4s, v6.4s\n"
"fmla v30.4s, v3.4s, v9.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 18f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 19f\n"
@@ -722,11 +722,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q4, [x16, #0x0]\n"
"ldr x20, [x15, #0x88]\n"
"fmla v31.4s, v3.4s, v13.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v4.4s, v6.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v13.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 20f\n"
"ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
@@ -738,10 +738,10 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q0, [x16, #0x0]\n"
"ldr x20, [x15, #0x90]\n"
"fmla v31.4s, v4.4s, v8.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v0.4s, v14.4s\n"
"fmla v29.4s, v0.4s, v11.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 22f\n"
"ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 23f\n"
@@ -752,7 +752,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"23:" // Oddments: Load input (3, 0): Bit 1: End
"ldr x20, [x15, #0x98]\n"
"fmla v30.4s, v0.4s, v5.4s\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 24f\n"
"ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 25f\n"
@@ -764,11 +764,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0xa0]\n"
"fmla v31.4s, v0.4s, v6.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v1.4s, v11.4s\n"
"fmla v29.4s, v1.4s, v12.4s\n"
"fmla v30.4s, v1.4s, v6.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 26f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 27f\n"
@@ -780,11 +780,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0xa8]\n"
"fmla v31.4s, v1.4s, v10.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v2.4s, v12.4s\n"
"fmla v29.4s, v2.4s, v9.4s\n"
"fmla v30.4s, v2.4s, v10.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 28f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 29f\n"
@@ -796,11 +796,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0xb0]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v3.4s, v9.4s\n"
"fmla v29.4s, v3.4s, v13.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 30f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 31f\n"
@@ -812,11 +812,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q4, [x16, #0x0]\n"
"ldr x20, [x15, #0xb8]\n"
"fmla v31.4s, v3.4s, v12.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v4.4s, v13.4s\n"
"fmla v29.4s, v4.4s, v8.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 32f\n"
"ld1 { v14.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 33f\n"
@@ -828,10 +828,10 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q0, [x16, #0x0]\n"
"ldr x20, [x15, #0xc0]\n"
"fmla v31.4s, v4.4s, v14.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v0.4s, v5.4s\n"
"fmla v29.4s, v0.4s, v6.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 34f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 35f\n"
@@ -842,7 +842,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"35:" // Oddments: Load input (4, 0): Bit 1: End
"ldr x20, [x15, #0xc8]\n"
"fmla v30.4s, v0.4s, v9.4s\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 36f\n"
"ld1 { v13.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 37f\n"
@@ -854,11 +854,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0xd0]\n"
"fmla v31.4s, v0.4s, v13.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v1.4s, v6.4s\n"
"fmla v29.4s, v1.4s, v10.4s\n"
"fmla v30.4s, v1.4s, v13.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 38f\n"
"ld1 { v5.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 39f\n"
@@ -870,11 +870,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0xd8]\n"
"fmla v31.4s, v1.4s, v5.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v2.4s, v10.4s\n"
"fmla v29.4s, v2.4s, v11.4s\n"
"fmla v30.4s, v2.4s, v5.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 40f\n"
"ld1 { v6.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 41f\n"
@@ -886,11 +886,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0xe0]\n"
"fmla v31.4s, v2.4s, v6.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v3.4s, v11.4s\n"
"fmla v29.4s, v3.4s, v12.4s\n"
"fmla v30.4s, v3.4s, v6.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 42f\n"
"ld1 { v8.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 43f\n"
@@ -902,11 +902,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q4, [x16, #0x0]\n"
"ldr x20, [x15, #0xe8]\n"
"fmla v31.4s, v3.4s, v8.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v4.4s, v12.4s\n"
"fmla v29.4s, v4.4s, v14.4s\n"
"fmla v30.4s, v4.4s, v8.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 44f\n"
"ld1 { v10.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 45f\n"
@@ -918,10 +918,10 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q0, [x16, #0x0]\n"
"ldr x20, [x15, #0xf0]\n"
"fmla v31.4s, v4.4s, v10.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v0.4s, v9.4s\n"
"fmla v29.4s, v0.4s, v13.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 46f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 47f\n"
@@ -932,7 +932,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"47:" // Oddments: Load input (5, 0): Bit 1: End
"ldr x20, [x15, #0xf8]\n"
"fmla v30.4s, v0.4s, v11.4s\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 48f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 49f\n"
@@ -944,11 +944,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q1, [x16, #0x0]\n"
"ldr x20, [x15, #0x100]\n"
"fmla v31.4s, v0.4s, v12.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v1.4s, v13.4s\n"
"fmla v29.4s, v1.4s, v5.4s\n"
"fmla v30.4s, v1.4s, v12.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 50f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 51f\n"
@@ -960,11 +960,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q2, [x16, #0x0]\n"
"ldr x20, [x15, #0x108]\n"
"fmla v31.4s, v1.4s, v9.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v2.4s, v5.4s\n"
"fmla v29.4s, v2.4s, v6.4s\n"
"fmla v30.4s, v2.4s, v9.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 52f\n"
"ld1 { v11.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 53f\n"
@@ -976,11 +976,11 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr q3, [x16, #0x0]\n"
"ldr x20, [x15, #0x110]\n"
"fmla v31.4s, v2.4s, v11.4s\n"
+ "add x16, x16, #0x10\n"
"fmla v28.4s, v3.4s, v6.4s\n"
"fmla v29.4s, v3.4s, v8.4s\n"
"fmla v30.4s, v3.4s, v11.4s\n"
- "add x20, x20, x10\n"
- "add x16, x16, #0x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 54f\n"
"ld1 { v12.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 55f\n"
@@ -995,7 +995,7 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmla v28.4s, v4.4s, v8.4s\n"
"fmla v29.4s, v4.4s, v10.4s\n"
"fmla v30.4s, v4.4s, v12.4s\n"
- "add x20, x20, x10\n"
+ "add x20, x20, x14\n"
"tbz %x[n_channels], #1, 56f\n"
"ld1 { v9.d }[0], [x20], #0x8\n"
"tbz %x[n_channels], #0, 57f\n"
@@ -1008,32 +1008,32 @@ void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
"fmax v28.4s, v28.4s, v27.4s\n"
"fmax v29.4s, v29.4s, v27.4s\n"
"fmax v30.4s, v30.4s, v27.4s\n"
- "fmax v31.4s, v31.4s, v27.4s\n"
"fmin v28.4s, v28.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
"fmin v29.4s, v29.4s, v15.4s\n"
"fmin v30.4s, v30.4s, v15.4s\n"
"fmin v31.4s, v31.4s, v15.4s\n"
"tbz %x[n_channels], #1, 58f\n"
- "st1 { v28.d }[0], [x14], #0x8\n"
- "st1 { v29.d }[0], [x13], #0x8\n"
- "st1 { v30.d }[0], [x12], #0x8\n"
- "st1 { v31.d }[0], [x11], #0x8\n"
+ "st1 { v28.d }[0], [x13], #0x8\n"
+ "st1 { v29.d }[0], [x12], #0x8\n"
+ "st1 { v30.d }[0], [x11], #0x8\n"
+ "st1 { v31.d }[0], [x10], #0x8\n"
"tbz %x[n_channels], #0, 59f\n"
- "st1 { v28.s }[2], [x14], #0x4\n"
- "st1 { v29.s }[2], [x13], #0x4\n"
- "st1 { v30.s }[2], [x12], #0x4\n"
- "st1 { v31.s }[2], [x11], #0x4\n"
+ "st1 { v28.s }[2], [x13], #0x4\n"
+ "st1 { v29.s }[2], [x12], #0x4\n"
+ "st1 { v30.s }[2], [x11], #0x4\n"
+ "st1 { v31.s }[2], [x10], #0x4\n"
"b 59f\n"
"58:" // Oddments: Store: Bit 1: Unset
- "st1 { v28.s }[0], [x14], #0x4\n"
- "st1 { v29.s }[0], [x13], #0x4\n"
- "st1 { v30.s }[0], [x12], #0x4\n"
- "st1 { v31.s }[0], [x11], #0x4\n"
+ "st1 { v28.s }[0], [x13], #0x4\n"
+ "st1 { v29.s }[0], [x12], #0x4\n"
+ "st1 { v30.s }[0], [x11], #0x4\n"
+ "st1 { v31.s }[0], [x10], #0x4\n"
"59:" // Oddments: Store: Bit 1: End
"60:" // End
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index a2f577784f..6fb4ce79f0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,97 +56,97 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x11]\n"
"2:" // Channel loop: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
- "mov x26, %x[inptrs]\n"
- "ldp x21, x20, [x26], #0x10\n"
- "subs x25, %x[n_points], #0x1\n"
- "ldr q14, [x21, x11]\n"
- "ldr q15, [x20, x11]\n"
+ "mov x23, %x[inptrs]\n"
+ "subs x22, %x[n_points], #0x1\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x21, x20, [x26], #0x10\n"
- "ldr q16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
+ "add %x[params], %x[params], #0x10\n"
"mov v27.16b, v23.16b\n"
- "ldr q17, [x20, x11]\n"
- "ldp x21, x20, [x26], #0x10\n"
"mov v28.16b, v23.16b\n"
+ "ldp x21, x20, [x23], #0x10\n"
"mov v29.16b, v23.16b\n"
- "ldr q18, [x21, x11]\n"
- "ldr q19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x21, x20, [x26], #0x10\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ldr q20, [x21, x11]\n"
- "add %x[params], %x[params], #0x10\n"
"ldr q21, [x20, x11]\n"
- "ldr x20, [x26], #0x8\n"
+ "ldr x20, [x23], #0x8\n"
"ldr q22, [x20, x11]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x20, x24, [x26], #0x10\n"
- "ldp x23, x22, [x26], #0x10\n"
- "subs x25, x25, #0x1\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
"fmla v23.4s, v14.4s, v0.4s\n"
- "ldr q14, [x20, x11]\n"
- "ldp x21, x20, [x26], #0x10\n"
"fmla v24.4s, v15.4s, v0.4s\n"
"fmla v25.4s, v16.4s, v0.4s\n"
- "ldr q15, [x24, x11]\n"
- "ldr q16, [x23, x11]\n"
"fmla v26.4s, v17.4s, v0.4s\n"
"fmla v27.4s, v18.4s, v0.4s\n"
- "ldr q17, [x22, x11]\n"
- "ldr q18, [x21, x11]\n"
"fmla v28.4s, v19.4s, v0.4s\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"fmla v29.4s, v20.4s, v0.4s\n"
- "ldr q19, [x20, x11]\n"
- "ldp x21, x20, [x26], #0x10\n"
"fmla v30.4s, v21.4s, v0.4s\n"
"fmla v31.4s, v22.4s, v0.4s\n"
"ldr q0, [%x[params], #0x0]\n"
- "ldr q20, [x21, x11]\n"
"add %x[params], %x[params], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ldr q20, [x21, x11]\n"
"ldr q21, [x20, x11]\n"
- "ldr x20, [x26], #0x8\n"
+ "ldr x20, [x23], #0x8\n"
"ldr q22, [x20, x11]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla v23.4s, v14.4s, v0.4s\n"
"fmla v24.4s, v15.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v2.4s\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
"fmla v25.4s, v16.4s, v0.4s\n"
"fmla v26.4s, v17.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v2.4s\n"
- "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla v27.4s, v18.4s, v0.4s\n"
"fmla v28.4s, v19.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v2.4s\n"
- "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"fmla v29.4s, v20.4s, v0.4s\n"
"fmla v30.4s, v21.4s, v0.4s\n"
- "fmax v26.4s, v26.4s, v2.4s\n"
- "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla v31.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v2.4s\n"
+ "fmax v24.4s, v24.4s, v2.4s\n"
+ "fmax v25.4s, v25.4s, v2.4s\n"
+ "fmax v26.4s, v26.4s, v2.4s\n"
"fmax v27.4s, v27.4s, v2.4s\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
"fmax v28.4s, v28.4s, v2.4s\n"
"fmax v29.4s, v29.4s, v2.4s\n"
"fmax v30.4s, v30.4s, v2.4s\n"
"fmax v31.4s, v31.4s, v2.4s\n"
"fmin v23.4s, v23.4s, v1.4s\n"
"fmin v24.4s, v24.4s, v1.4s\n"
- "str q23, [x28, x11]\n"
"fmin v25.4s, v25.4s, v1.4s\n"
"fmin v26.4s, v26.4s, v1.4s\n"
- "str q24, [x27, x11]\n"
"fmin v27.4s, v27.4s, v1.4s\n"
"fmin v28.4s, v28.4s, v1.4s\n"
- "str q25, [x26, x11]\n"
"fmin v29.4s, v29.4s, v1.4s\n"
"fmin v30.4s, v30.4s, v1.4s\n"
- "str q26, [x25, x11]\n"
+ "str q23, [x28, x11]\n"
"fmin v31.4s, v31.4s, v1.4s\n"
+ "str q24, [x27, x11]\n"
+ "str q25, [x26, x11]\n"
+ "str q26, [x25, x11]\n"
"str q27, [x24, x11]\n"
"str q28, [x23, x11]\n"
"str q29, [x22, x11]\n"
@@ -172,29 +172,29 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"8:" // Oddments: Load bias: Done
"ldr q0, [%x[params], #0x0]\n"
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x27, x26, [x10], #0x10\n"
- "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x23, x22, [x10], #0x10\n"
- "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
+ "add %x[params], %x[params], #0x10\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
"add x9, x9, x11\n"
"add x28, x28, x11\n"
- "mov v31.16b, v23.16b\n"
+ "ldp x25, x24, [x10], #0x10\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 9f\n"
"ldr d14, [x9], #0x8\n"
"ldr d15, [x28], #0x8\n"
@@ -231,30 +231,30 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"ble 14f\n"
"11:" // Oddments: Planar loop
"ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
"fmla v23.4s, v14.4s, v0.4s\n"
"fmla v24.4s, v15.4s, v0.4s\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x10], #0x10\n"
"fmla v25.4s, v16.4s, v0.4s\n"
"fmla v26.4s, v17.4s, v0.4s\n"
- "ldr x21, [x10], #0x8\n"
"fmla v27.4s, v18.4s, v0.4s\n"
"fmla v28.4s, v19.4s, v0.4s\n"
- "add x9, x9, x11\n"
+ "ldp x27, x26, [x10], #0x10\n"
"fmla v29.4s, v20.4s, v0.4s\n"
"fmla v30.4s, v21.4s, v0.4s\n"
+ "add x9, x9, x11\n"
"add x28, x28, x11\n"
- "add x27, x27, x11\n"
"fmla v31.4s, v22.4s, v0.4s\n"
"ldr q0, [%x[params], #0x0]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
- "add %x[params], %x[params], #0x10\n"
"tbz %x[n_channels], #1, 12f\n"
"ldr d14, [x9], #0x8\n"
"ldr d15, [x28], #0x8\n"
@@ -292,40 +292,40 @@ void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
"14:" // Oddments: Planar tail
"fmla v23.4s, v14.4s, v0.4s\n"
"fmla v24.4s, v15.4s, v0.4s\n"
- "fmax v23.4s, v23.4s, v2.4s\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
"fmla v25.4s, v16.4s, v0.4s\n"
"fmla v26.4s, v17.4s, v0.4s\n"
- "fmax v24.4s, v24.4s, v2.4s\n"
- "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla v27.4s, v18.4s, v0.4s\n"
"fmla v28.4s, v19.4s, v0.4s\n"
- "fmax v25.4s, v25.4s, v2.4s\n"
- "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"fmla v29.4s, v20.4s, v0.4s\n"
"fmla v30.4s, v21.4s, v0.4s\n"
- "fmax v26.4s, v26.4s, v2.4s\n"
- "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
"fmla v31.4s, v22.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v2.4s\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "fmax v24.4s, v24.4s, v2.4s\n"
+ "fmax v25.4s, v25.4s, v2.4s\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "fmax v26.4s, v26.4s, v2.4s\n"
"fmax v27.4s, v27.4s, v2.4s\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "add x28, x28, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
"fmax v28.4s, v28.4s, v2.4s\n"
"fmax v29.4s, v29.4s, v2.4s\n"
- "add x27, x27, x11\n"
- "add x26, x26, x11\n"
+ "add x20, x20, x11\n"
"fmax v30.4s, v30.4s, v2.4s\n"
"fmax v31.4s, v31.4s, v2.4s\n"
- "add x25, x25, x11\n"
- "add x24, x24, x11\n"
"fmin v23.4s, v23.4s, v1.4s\n"
"fmin v24.4s, v24.4s, v1.4s\n"
- "add x23, x23, x11\n"
- "add x22, x22, x11\n"
"fmin v25.4s, v25.4s, v1.4s\n"
"fmin v26.4s, v26.4s, v1.4s\n"
- "add x21, x21, x11\n"
- "add x20, x20, x11\n"
"fmin v27.4s, v27.4s, v1.4s\n"
"fmin v28.4s, v28.4s, v1.4s\n"
"fmin v29.4s, v29.4s, v1.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 9cafd23fb8..ac255d149f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,49 +43,49 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
__asm__ __volatile__(
"ld1r { v27.4s }, [%x[clamps]]\n"
- "ldr x21, [%x[inptrs], #0x0]\n"
- "lsr x22, %x[channel_multiplier], #0x2\n"
+ "ldr x25, [%x[inptrs], #0x0]\n"
+ "lsr x24, %x[channel_multiplier], #0x2\n"
"add x20, %x[clamps], #0x4\n"
- "ldr q0, [x21, #0x0]\n"
- "ldr q1, [x21, #0x10]\n"
- "mov x21, #0x0\n"
- "mov x14, #0x0\n"
"ld1r { v26.4s }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ldr q2, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "ldr q4, [x20, #0x0]\n"
- "ldr q5, [x20, #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x8]\n"
+ "mov x23, #0x0\n"
+ "mov x15, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x10]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x25, #0x10]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x22, #0x10]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q5, [x21, #0x10]\n"
"ldr q6, [x20, #0x0]\n"
"ldr q7, [x20, #0x10]\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "ldr q8, [x20, #0x0]\n"
- "ldr q9, [x20, #0x10]\n"
- "ldr x20, [%x[inptrs], #0x28]\n"
- "ldr q10, [x20, #0x0]\n"
- "ldr q11, [x20, #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x28]\n"
"ldr x20, [%x[inptrs], #0x30]\n"
+ "ldp x14, x13, [%x[outptrs], #0x0]\n"
+ "ldp x12, x11, [%x[outptrs], #0x10]\n"
+ "ldp x10, x9, [%x[outptrs], #0x20]\n"
+ "ldr q8, [x22, #0x0]\n"
+ "ldr q9, [x22, #0x10]\n"
+ "ldr q10, [x21, #0x0]\n"
+ "ldr q11, [x21, #0x10]\n"
"ldr q12, [x20, #0x0]\n"
"ldr q13, [x20, #0x10]\n"
- "ldp x13, x12, [%x[outptrs], #0x0]\n"
- "ldp x11, x10, [%x[outptrs], #0x10]\n"
- "ldp x9, x28, [%x[outptrs], #0x20]\n"
- "ldp x27, x26, [%x[outptrs], #0x30]\n"
- "ldr x25, [%x[outptrs], #0x40]\n"
- "cbz x22, 3f\n"
+ "ldp x28, x27, [%x[outptrs], #0x30]\n"
+ "ldr x26, [%x[outptrs], #0x40]\n"
+ "cbz x24, 3f\n"
"ldr q14, [%x[params], #0x0]\n"
"ldr q31, [%x[params], #0x10]\n"
- "subs x22, x22, #0x1\n"
- "mov v15.16b, v14.16b\n"
+ "subs x24, x24, #0x1\n"
"ldr q30, [%x[params], #0x20]\n"
"ldr q29, [%x[params], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "mov v15.16b, v14.16b\n"
"mov v16.16b, v14.16b\n"
"mov v17.16b, v14.16b\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
- "add %x[params], %x[params], #0x40\n"
"mov v20.16b, v14.16b\n"
"mov v21.16b, v14.16b\n"
"mov v22.16b, v14.16b\n"
@@ -93,8 +93,8 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"1:" // Output channel complete vector loop
"fmla v14.4s, v31.4s, v0.s[0]\n"
"fmla v15.4s, v31.4s, v0.s[2]\n"
- "subs x22, x22, #0x1\n"
- "add x21, x21, #0x4\n"
+ "subs x24, x24, #0x1\n"
+ "add x23, x23, #0x4\n"
"fmla v16.4s, v31.4s, v1.s[0]\n"
"fmla v17.4s, v31.4s, v4.s[0]\n"
"fmla v18.4s, v31.4s, v4.s[2]\n"
@@ -175,52 +175,52 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"ldr q30, [%x[params], #0x80]\n"
"fmla v14.4s, v23.4s, v4.s[2]\n"
"fmla v15.4s, v23.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v26.4s\n"
"fmla v16.4s, v23.4s, v5.s[2]\n"
"fmla v17.4s, v23.4s, v8.s[2]\n"
- "fmax v14.4s, v14.4s, v27.4s\n"
- "str q14, [x13, x14]\n"
- "ldr q14, [%x[params], #0x60]\n"
"fmla v18.4s, v23.4s, v9.s[0]\n"
"fmla v19.4s, v23.4s, v9.s[2]\n"
- "fmin v15.4s, v15.4s, v26.4s\n"
"fmla v20.4s, v23.4s, v12.s[2]\n"
"fmla v21.4s, v23.4s, v13.s[0]\n"
- "fmin v16.4s, v16.4s, v26.4s\n"
"fmla v22.4s, v23.4s, v13.s[2]\n"
"ldr q29, [%x[params], #0x90]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
"fmin v17.4s, v17.4s, v26.4s\n"
- "add %x[params], %x[params], #0xa0\n"
"fmin v18.4s, v18.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
"fmin v19.4s, v19.4s, v26.4s\n"
"fmin v20.4s, v20.4s, v26.4s\n"
"fmin v21.4s, v21.4s, v26.4s\n"
"fmin v22.4s, v22.4s, v26.4s\n"
"fmax v15.4s, v15.4s, v27.4s\n"
- "str q15, [x12, x14]\n"
+ "str q14, [x14, x15]\n"
+ "ldr q14, [%x[params], #0x60]\n"
"fmax v16.4s, v16.4s, v27.4s\n"
"fmax v17.4s, v17.4s, v27.4s\n"
- "str q16, [x11, x14]\n"
+ "add %x[params], %x[params], #0xa0\n"
"fmax v18.4s, v18.4s, v27.4s\n"
"fmax v19.4s, v19.4s, v27.4s\n"
- "str q17, [x10, x14]\n"
"fmax v20.4s, v20.4s, v27.4s\n"
"fmax v21.4s, v21.4s, v27.4s\n"
- "str q18, [x9, x14]\n"
+ "str q15, [x13, x15]\n"
"fmax v22.4s, v22.4s, v27.4s\n"
- "str q19, [x28, x14]\n"
+ "str q16, [x12, x15]\n"
"mov v15.16b, v14.16b\n"
- "str q20, [x27, x14]\n"
+ "str q17, [x11, x15]\n"
"mov v16.16b, v14.16b\n"
"mov v17.16b, v14.16b\n"
- "str q21, [x26, x14]\n"
+ "str q18, [x10, x15]\n"
"mov v18.16b, v14.16b\n"
+ "str q19, [x9, x15]\n"
"mov v19.16b, v14.16b\n"
- "str q22, [x25, x14]\n"
+ "str q20, [x28, x15]\n"
"mov v20.16b, v14.16b\n"
+ "str q21, [x27, x15]\n"
"mov v21.16b, v14.16b\n"
- "add x14, x14, #0x10\n"
+ "str q22, [x26, x15]\n"
"mov v22.16b, v14.16b\n"
+ "add x15, x15, #0x10\n"
"bgt 1b\n"
"2:" // Output channel complete vector tail
"fmla v14.4s, v31.4s, v0.s[0]\n"
@@ -304,17 +304,17 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v22.4s, v24.4s, v13.s[1]\n"
"fmla v14.4s, v23.4s, v4.s[2]\n"
"fmla v15.4s, v23.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v26.4s\n"
"fmla v16.4s, v23.4s, v5.s[2]\n"
"fmla v17.4s, v23.4s, v8.s[2]\n"
- "fmin v15.4s, v15.4s, v26.4s\n"
"fmla v18.4s, v23.4s, v9.s[0]\n"
"fmla v19.4s, v23.4s, v9.s[2]\n"
- "fmin v16.4s, v16.4s, v26.4s\n"
"fmla v20.4s, v23.4s, v12.s[2]\n"
"fmla v21.4s, v23.4s, v13.s[0]\n"
- "fmin v17.4s, v17.4s, v26.4s\n"
"fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
"fmin v18.4s, v18.4s, v26.4s\n"
"fmin v19.4s, v19.4s, v26.4s\n"
"fmin v20.4s, v20.4s, v26.4s\n"
@@ -322,32 +322,32 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmin v22.4s, v22.4s, v26.4s\n"
"fmax v14.4s, v14.4s, v27.4s\n"
"fmax v15.4s, v15.4s, v27.4s\n"
- "str q14, [x13, x14]\n"
"fmax v16.4s, v16.4s, v27.4s\n"
"fmax v17.4s, v17.4s, v27.4s\n"
- "str q15, [x12, x14]\n"
"fmax v18.4s, v18.4s, v27.4s\n"
"fmax v19.4s, v19.4s, v27.4s\n"
- "str q16, [x11, x14]\n"
"fmax v20.4s, v20.4s, v27.4s\n"
"fmax v21.4s, v21.4s, v27.4s\n"
- "str q17, [x10, x14]\n"
+ "str q14, [x14, x15]\n"
"fmax v22.4s, v22.4s, v27.4s\n"
- "str q18, [x9, x14]\n"
- "str q19, [x28, x14]\n"
- "str q20, [x27, x14]\n"
- "str q21, [x26, x14]\n"
- "str q22, [x25, x14]\n"
- "add x14, x14, #0x10\n"
+ "str q15, [x13, x15]\n"
+ "str q16, [x12, x15]\n"
+ "str q17, [x11, x15]\n"
+ "str q18, [x10, x15]\n"
+ "str q19, [x9, x15]\n"
+ "str q20, [x28, x15]\n"
+ "str q21, [x27, x15]\n"
+ "str q22, [x26, x15]\n"
+ "add x15, x15, #0x10\n"
"3:" // Output channel oddments
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q14, [%x[params], #0x0]\n"
"ldr q25, [%x[params], #0x10]\n"
- "mov v15.16b, v14.16b\n"
- "mov v16.16b, v14.16b\n"
"ldr q24, [%x[params], #0x20]\n"
"ldr q23, [%x[params], #0x30]\n"
+ "mov v15.16b, v14.16b\n"
+ "mov v16.16b, v14.16b\n"
"mov v17.16b, v14.16b\n"
"mov v18.16b, v14.16b\n"
"mov v19.16b, v14.16b\n"
@@ -435,17 +435,17 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla v22.4s, v24.4s, v13.s[1]\n"
"fmla v14.4s, v23.4s, v4.s[2]\n"
"fmla v15.4s, v23.4s, v5.s[0]\n"
- "fmin v14.4s, v14.4s, v26.4s\n"
"fmla v16.4s, v23.4s, v5.s[2]\n"
"fmla v17.4s, v23.4s, v8.s[2]\n"
- "fmin v15.4s, v15.4s, v26.4s\n"
"fmla v18.4s, v23.4s, v9.s[0]\n"
"fmla v19.4s, v23.4s, v9.s[2]\n"
- "fmin v16.4s, v16.4s, v26.4s\n"
"fmla v20.4s, v23.4s, v12.s[2]\n"
"fmla v21.4s, v23.4s, v13.s[0]\n"
- "fmin v17.4s, v17.4s, v26.4s\n"
"fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
"fmin v18.4s, v18.4s, v26.4s\n"
"fmin v19.4s, v19.4s, v26.4s\n"
"fmin v20.4s, v20.4s, v26.4s\n"
@@ -461,39 +461,39 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmax v21.4s, v21.4s, v27.4s\n"
"fmax v22.4s, v22.4s, v27.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
- "add x20, x13, x14\n"
- "add x22, x12, x14\n"
- "st1 { v14.d }[0], [x20]\n"
- "add x21, x11, x14\n"
- "add x20, x10, x14\n"
- "st1 { v15.d }[0], [x22]\n"
- "add x24, x9, x14\n"
- "add x23, x28, x14\n"
- "st1 { v16.d }[0], [x21]\n"
- "add x22, x27, x14\n"
- "add x21, x26, x14\n"
- "st1 { v17.d }[0], [x20]\n"
- "add x20, x25, x14\n"
+ "add x22, x14, x15\n"
+ "add x21, x13, x15\n"
+ "add x20, x12, x15\n"
+ "add x25, x11, x15\n"
+ "st1 { v14.d }[0], [x22]\n"
+ "add x24, x10, x15\n"
+ "add x23, x9, x15\n"
+ "st1 { v15.d }[0], [x21]\n"
+ "add x22, x28, x15\n"
+ "add x21, x27, x15\n"
+ "st1 { v16.d }[0], [x20]\n"
+ "add x20, x26, x15\n"
+ "st1 { v17.d }[0], [x25]\n"
+ "add x15, x15, #0x8\n"
"st1 { v18.d }[0], [x24]\n"
- "add x14, x14, #0x8\n"
"st1 { v19.d }[0], [x23]\n"
"st1 { v20.d }[0], [x22]\n"
"st1 { v21.d }[0], [x21]\n"
"st1 { v22.d }[0], [x20]\n"
"tbz %x[channel_multiplier], #0, 5f\n"
- "add x20, x13, x14\n"
- "add x22, x12, x14\n"
- "st1 { v14.s }[2], [x20]\n"
- "add x21, x11, x14\n"
- "add x20, x10, x14\n"
- "st1 { v15.s }[2], [x22]\n"
- "add x24, x9, x14\n"
- "add x23, x28, x14\n"
- "st1 { v16.s }[2], [x21]\n"
- "add x22, x27, x14\n"
- "add x21, x26, x14\n"
- "st1 { v17.s }[2], [x20]\n"
- "add x20, x25, x14\n"
+ "add x22, x14, x15\n"
+ "add x21, x13, x15\n"
+ "add x20, x12, x15\n"
+ "add x25, x11, x15\n"
+ "st1 { v14.s }[2], [x22]\n"
+ "add x24, x10, x15\n"
+ "add x23, x9, x15\n"
+ "st1 { v15.s }[2], [x21]\n"
+ "add x22, x28, x15\n"
+ "add x21, x27, x15\n"
+ "st1 { v16.s }[2], [x20]\n"
+ "add x20, x26, x15\n"
+ "st1 { v17.s }[2], [x25]\n"
"st1 { v18.s }[2], [x24]\n"
"st1 { v19.s }[2], [x23]\n"
"st1 { v20.s }[2], [x22]\n"
@@ -501,19 +501,19 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"st1 { v22.s }[2], [x20]\n"
"b 5f\n"
"4:" // Output channel oddments: Store: Bit 1: Unset
- "add x20, x13, x14\n"
- "add x22, x12, x14\n"
- "st1 { v14.s }[0], [x20]\n"
- "add x21, x11, x14\n"
- "add x20, x10, x14\n"
- "st1 { v15.s }[0], [x22]\n"
- "add x24, x9, x14\n"
- "add x23, x28, x14\n"
- "st1 { v16.s }[0], [x21]\n"
- "add x22, x27, x14\n"
- "add x21, x26, x14\n"
- "st1 { v17.s }[0], [x20]\n"
- "add x20, x25, x14\n"
+ "add x22, x14, x15\n"
+ "add x21, x13, x15\n"
+ "add x20, x12, x15\n"
+ "add x25, x11, x15\n"
+ "st1 { v14.s }[0], [x22]\n"
+ "add x24, x10, x15\n"
+ "add x23, x9, x15\n"
+ "st1 { v15.s }[0], [x21]\n"
+ "add x22, x28, x15\n"
+ "add x21, x27, x15\n"
+ "st1 { v16.s }[0], [x20]\n"
+ "add x20, x26, x15\n"
+ "st1 { v17.s }[0], [x25]\n"
"st1 { v18.s }[0], [x24]\n"
"st1 { v19.s }[0], [x23]\n"
"st1 { v20.s }[0], [x22]\n"
@@ -523,7 +523,7 @@ void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"6:" // End
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index c9bb1f41da..2f6a399d67 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,55 +43,55 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
__asm__ __volatile__(
"ld1r { v26.4s }, [%x[clamps]]\n"
- "ldr x21, [%x[inptrs], #0x0]\n"
- "lsr x22, %x[channel_multiplier], #0x2\n"
+ "ldr x25, [%x[inptrs], #0x0]\n"
+ "lsr x24, %x[channel_multiplier], #0x2\n"
"add x20, %x[clamps], #0x4\n"
- "ldr q0, [x21, #0x0]\n"
- "ldr q1, [x21, #0x10]\n"
- "mov x21, #0x0\n"
- "mov x13, #0x0\n"
"ld1r { v25.4s }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ldr q2, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "ldr q4, [x20, #0x0]\n"
- "ldr q5, [x20, #0x10]\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "mov x22, #0x0\n"
+ "mov x14, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x10]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x25, #0x10]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x23, #0x10]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q5, [x21, #0x10]\n"
"ldr q6, [x20, #0x0]\n"
"ldr q7, [x20, #0x10]\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "ldr q8, [x20, #0x0]\n"
- "ldr q9, [x20, #0x10]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
"ldr x20, [%x[inptrs], #0x28]\n"
+ "ldp x13, x12, [%x[outptrs], #0x0]\n"
+ "ldp x11, x10, [%x[outptrs], #0x10]\n"
+ "ldp x9, x28, [%x[outptrs], #0x20]\n"
+ "ldp x27, x26, [%x[outptrs], #0x30]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q9, [x21, #0x10]\n"
"ldr q10, [x20, #0x0]\n"
"ldr q11, [x20, #0x10]\n"
- "ldp x12, x11, [%x[outptrs], #0x0]\n"
- "ldp x10, x9, [%x[outptrs], #0x10]\n"
- "ldp x28, x27, [%x[outptrs], #0x20]\n"
- "ldp x26, x25, [%x[outptrs], #0x30]\n"
- "cbz x22, 3f\n"
+ "cbz x24, 3f\n"
"ldr q12, [%x[params], #0x0]\n"
"ldr q31, [%x[params], #0x10]\n"
- "subs x22, x22, #0x1\n"
- "mov v13.16b, v12.16b\n"
+ "subs x24, x24, #0x1\n"
"ldr q30, [%x[params], #0x20]\n"
"ldr q29, [%x[params], #0x30]\n"
- "mov v14.16b, v12.16b\n"
- "mov v15.16b, v12.16b\n"
"ldr q28, [%x[params], #0x40]\n"
"ldr q27, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v15.16b, v12.16b\n"
"mov v16.16b, v12.16b\n"
"mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
"mov v19.16b, v12.16b\n"
- "add %x[params], %x[params], #0x60\n"
"beq 2f\n"
"1:" // Output channel complete vector loop
"fmla v12.4s, v31.4s, v0.s[0]\n"
"fmla v13.4s, v31.4s, v0.s[1]\n"
- "subs x22, x22, #0x1\n"
- "add x21, x21, #0x4\n"
+ "subs x24, x24, #0x1\n"
+ "add x22, x22, #0x4\n"
"fmla v14.4s, v31.4s, v0.s[2]\n"
"fmla v15.4s, v31.4s, v0.s[3]\n"
"fmla v16.4s, v31.4s, v2.s[0]\n"
@@ -308,47 +308,47 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"ldr q28, [%x[params], #0x180]\n"
"fmla v12.4s, v20.4s, v9.s[0]\n"
"fmla v13.4s, v20.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v25.4s\n"
"fmla v14.4s, v20.4s, v9.s[2]\n"
"fmla v15.4s, v20.4s, v9.s[3]\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
- "str q12, [x12, x13]\n"
- "ldr q12, [%x[params], #0x140]\n"
"fmla v16.4s, v20.4s, v11.s[0]\n"
"fmla v17.4s, v20.4s, v11.s[1]\n"
- "fmin v13.4s, v13.4s, v25.4s\n"
"fmla v18.4s, v20.4s, v11.s[2]\n"
"fmla v19.4s, v20.4s, v11.s[3]\n"
"ldr q27, [%x[params], #0x190]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
"fmin v14.4s, v14.4s, v25.4s\n"
"fmin v15.4s, v15.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
"fmin v16.4s, v16.4s, v25.4s\n"
- "add %x[params], %x[params], #0x1a0\n"
"fmin v17.4s, v17.4s, v25.4s\n"
"fmin v18.4s, v18.4s, v25.4s\n"
"fmin v19.4s, v19.4s, v25.4s\n"
+ "str q12, [x13, x14]\n"
+ "ldr q12, [%x[params], #0x140]\n"
"fmax v13.4s, v13.4s, v26.4s\n"
- "str q13, [x11, x13]\n"
"fmax v14.4s, v14.4s, v26.4s\n"
+ "add %x[params], %x[params], #0x1a0\n"
"fmax v15.4s, v15.4s, v26.4s\n"
- "str q14, [x10, x13]\n"
"fmax v16.4s, v16.4s, v26.4s\n"
"fmax v17.4s, v17.4s, v26.4s\n"
- "str q15, [x9, x13]\n"
"fmax v18.4s, v18.4s, v26.4s\n"
"fmax v19.4s, v19.4s, v26.4s\n"
- "str q16, [x28, x13]\n"
- "str q17, [x27, x13]\n"
+ "str q13, [x12, x14]\n"
"mov v13.16b, v12.16b\n"
+ "str q14, [x11, x14]\n"
"mov v14.16b, v12.16b\n"
- "str q18, [x26, x13]\n"
+ "str q15, [x10, x14]\n"
"mov v15.16b, v12.16b\n"
+ "str q16, [x9, x14]\n"
"mov v16.16b, v12.16b\n"
- "str q19, [x25, x13]\n"
+ "str q17, [x28, x14]\n"
"mov v17.16b, v12.16b\n"
+ "str q18, [x27, x14]\n"
"mov v18.16b, v12.16b\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x26, x14]\n"
"mov v19.16b, v12.16b\n"
+ "add x14, x14, #0x10\n"
"bgt 1b\n"
"2:" // Output channel complete vector tail
"fmla v12.4s, v31.4s, v0.s[0]\n"
@@ -566,55 +566,55 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v19.4s, v21.4s, v11.s[2]\n"
"fmla v12.4s, v20.4s, v9.s[0]\n"
"fmla v13.4s, v20.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v25.4s\n"
"fmla v14.4s, v20.4s, v9.s[2]\n"
"fmla v15.4s, v20.4s, v9.s[3]\n"
- "fmin v13.4s, v13.4s, v25.4s\n"
"fmla v16.4s, v20.4s, v11.s[0]\n"
"fmla v17.4s, v20.4s, v11.s[1]\n"
- "fmin v14.4s, v14.4s, v25.4s\n"
"fmla v18.4s, v20.4s, v11.s[2]\n"
"fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
"fmin v15.4s, v15.4s, v25.4s\n"
"fmin v16.4s, v16.4s, v25.4s\n"
"fmin v17.4s, v17.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
"fmin v18.4s, v18.4s, v25.4s\n"
"fmin v19.4s, v19.4s, v25.4s\n"
- "fmax v12.4s, v12.4s, v26.4s\n"
"fmax v13.4s, v13.4s, v26.4s\n"
- "str q12, [x12, x13]\n"
"fmax v14.4s, v14.4s, v26.4s\n"
"fmax v15.4s, v15.4s, v26.4s\n"
- "str q13, [x11, x13]\n"
"fmax v16.4s, v16.4s, v26.4s\n"
+ "str q12, [x13, x14]\n"
"fmax v17.4s, v17.4s, v26.4s\n"
- "str q14, [x10, x13]\n"
"fmax v18.4s, v18.4s, v26.4s\n"
"fmax v19.4s, v19.4s, v26.4s\n"
- "str q15, [x9, x13]\n"
- "str q16, [x28, x13]\n"
- "str q17, [x27, x13]\n"
- "str q18, [x26, x13]\n"
- "str q19, [x25, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q13, [x12, x14]\n"
+ "str q14, [x11, x14]\n"
+ "str q15, [x10, x14]\n"
+ "str q16, [x9, x14]\n"
+ "str q17, [x28, x14]\n"
+ "str q18, [x27, x14]\n"
+ "str q19, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
"3:" // Output channel oddments
"tst %x[channel_multiplier], #0x3\n"
"beq 6f\n"
"ldr q12, [%x[params], #0x0]\n"
"ldr q24, [%x[params], #0x10]\n"
- "mov v13.16b, v12.16b\n"
- "mov v14.16b, v12.16b\n"
"ldr q23, [%x[params], #0x20]\n"
"ldr q22, [%x[params], #0x30]\n"
- "mov v15.16b, v12.16b\n"
- "mov v16.16b, v12.16b\n"
"ldr q21, [%x[params], #0x40]\n"
"ldr q20, [%x[params], #0x50]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
"mov v17.16b, v12.16b\n"
"mov v18.16b, v12.16b\n"
+ "fmla v13.4s, v24.4s, v0.s[1]\n"
"mov v19.16b, v12.16b\n"
"fmla v12.4s, v24.4s, v0.s[0]\n"
- "fmla v13.4s, v24.4s, v0.s[1]\n"
"fmla v14.4s, v24.4s, v0.s[2]\n"
"fmla v15.4s, v24.4s, v0.s[3]\n"
"fmla v16.4s, v24.4s, v2.s[0]\n"
@@ -622,8 +622,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v24.4s, v2.s[2]\n"
"fmla v19.4s, v24.4s, v2.s[3]\n"
"ldr q24, [%x[params], #0x60]\n"
- "fmla v12.4s, v23.4s, v0.s[1]\n"
"fmla v13.4s, v23.4s, v0.s[2]\n"
+ "fmla v12.4s, v23.4s, v0.s[1]\n"
"fmla v14.4s, v23.4s, v0.s[3]\n"
"fmla v15.4s, v23.4s, v1.s[0]\n"
"fmla v16.4s, v23.4s, v2.s[1]\n"
@@ -631,8 +631,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v23.4s, v2.s[3]\n"
"fmla v19.4s, v23.4s, v3.s[0]\n"
"ldr q23, [%x[params], #0x70]\n"
- "fmla v12.4s, v22.4s, v0.s[2]\n"
"fmla v13.4s, v22.4s, v0.s[3]\n"
+ "fmla v12.4s, v22.4s, v0.s[2]\n"
"fmla v14.4s, v22.4s, v1.s[0]\n"
"fmla v15.4s, v22.4s, v1.s[1]\n"
"fmla v16.4s, v22.4s, v2.s[2]\n"
@@ -640,8 +640,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v22.4s, v3.s[0]\n"
"fmla v19.4s, v22.4s, v3.s[1]\n"
"ldr q22, [%x[params], #0x80]\n"
- "fmla v12.4s, v21.4s, v0.s[3]\n"
"fmla v13.4s, v21.4s, v1.s[0]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
"fmla v14.4s, v21.4s, v1.s[1]\n"
"fmla v15.4s, v21.4s, v1.s[2]\n"
"fmla v16.4s, v21.4s, v2.s[3]\n"
@@ -649,8 +649,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v21.4s, v3.s[1]\n"
"fmla v19.4s, v21.4s, v3.s[2]\n"
"ldr q21, [%x[params], #0x90]\n"
- "fmla v12.4s, v20.4s, v1.s[0]\n"
"fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v12.4s, v20.4s, v1.s[0]\n"
"fmla v14.4s, v20.4s, v1.s[2]\n"
"fmla v15.4s, v20.4s, v1.s[3]\n"
"fmla v16.4s, v20.4s, v3.s[0]\n"
@@ -658,8 +658,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v20.4s, v3.s[2]\n"
"fmla v19.4s, v20.4s, v3.s[3]\n"
"ldr q20, [%x[params], #0xa0]\n"
- "fmla v12.4s, v24.4s, v2.s[0]\n"
"fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
"fmla v14.4s, v24.4s, v2.s[2]\n"
"fmla v15.4s, v24.4s, v2.s[3]\n"
"fmla v16.4s, v24.4s, v4.s[0]\n"
@@ -667,8 +667,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v24.4s, v4.s[2]\n"
"fmla v19.4s, v24.4s, v4.s[3]\n"
"ldr q24, [%x[params], #0xb0]\n"
- "fmla v12.4s, v23.4s, v2.s[1]\n"
"fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
"fmla v14.4s, v23.4s, v2.s[3]\n"
"fmla v15.4s, v23.4s, v3.s[0]\n"
"fmla v16.4s, v23.4s, v4.s[1]\n"
@@ -676,8 +676,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v23.4s, v4.s[3]\n"
"fmla v19.4s, v23.4s, v5.s[0]\n"
"ldr q23, [%x[params], #0xc0]\n"
- "fmla v12.4s, v22.4s, v2.s[2]\n"
"fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
"fmla v14.4s, v22.4s, v3.s[0]\n"
"fmla v15.4s, v22.4s, v3.s[1]\n"
"fmla v16.4s, v22.4s, v4.s[2]\n"
@@ -685,8 +685,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v22.4s, v5.s[0]\n"
"fmla v19.4s, v22.4s, v5.s[1]\n"
"ldr q22, [%x[params], #0xd0]\n"
- "fmla v12.4s, v21.4s, v2.s[3]\n"
"fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
"fmla v14.4s, v21.4s, v3.s[1]\n"
"fmla v15.4s, v21.4s, v3.s[2]\n"
"fmla v16.4s, v21.4s, v4.s[3]\n"
@@ -694,8 +694,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v21.4s, v5.s[1]\n"
"fmla v19.4s, v21.4s, v5.s[2]\n"
"ldr q21, [%x[params], #0xe0]\n"
- "fmla v12.4s, v20.4s, v3.s[0]\n"
"fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
"fmla v14.4s, v20.4s, v3.s[2]\n"
"fmla v15.4s, v20.4s, v3.s[3]\n"
"fmla v16.4s, v20.4s, v5.s[0]\n"
@@ -703,8 +703,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v20.4s, v5.s[2]\n"
"fmla v19.4s, v20.4s, v5.s[3]\n"
"ldr q20, [%x[params], #0xf0]\n"
- "fmla v12.4s, v24.4s, v4.s[0]\n"
"fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
"fmla v14.4s, v24.4s, v4.s[2]\n"
"fmla v15.4s, v24.4s, v4.s[3]\n"
"fmla v16.4s, v24.4s, v6.s[0]\n"
@@ -712,8 +712,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v24.4s, v6.s[2]\n"
"fmla v19.4s, v24.4s, v6.s[3]\n"
"ldr q24, [%x[params], #0x100]\n"
- "fmla v12.4s, v23.4s, v4.s[1]\n"
"fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
"fmla v14.4s, v23.4s, v4.s[3]\n"
"fmla v15.4s, v23.4s, v5.s[0]\n"
"fmla v16.4s, v23.4s, v6.s[1]\n"
@@ -721,8 +721,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v23.4s, v6.s[3]\n"
"fmla v19.4s, v23.4s, v7.s[0]\n"
"ldr q23, [%x[params], #0x110]\n"
- "fmla v12.4s, v22.4s, v4.s[2]\n"
"fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
"fmla v14.4s, v22.4s, v5.s[0]\n"
"fmla v15.4s, v22.4s, v5.s[1]\n"
"fmla v16.4s, v22.4s, v6.s[2]\n"
@@ -730,8 +730,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v22.4s, v7.s[0]\n"
"fmla v19.4s, v22.4s, v7.s[1]\n"
"ldr q22, [%x[params], #0x120]\n"
- "fmla v12.4s, v21.4s, v4.s[3]\n"
"fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
"fmla v14.4s, v21.4s, v5.s[1]\n"
"fmla v15.4s, v21.4s, v5.s[2]\n"
"fmla v16.4s, v21.4s, v6.s[3]\n"
@@ -739,8 +739,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v21.4s, v7.s[1]\n"
"fmla v19.4s, v21.4s, v7.s[2]\n"
"ldr q21, [%x[params], #0x130]\n"
- "fmla v12.4s, v20.4s, v5.s[0]\n"
"fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
"fmla v14.4s, v20.4s, v5.s[2]\n"
"fmla v15.4s, v20.4s, v5.s[3]\n"
"fmla v16.4s, v20.4s, v7.s[0]\n"
@@ -748,8 +748,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v20.4s, v7.s[2]\n"
"fmla v19.4s, v20.4s, v7.s[3]\n"
"ldr q20, [%x[params], #0x140]\n"
- "fmla v12.4s, v24.4s, v6.s[0]\n"
"fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
"fmla v14.4s, v24.4s, v6.s[2]\n"
"fmla v15.4s, v24.4s, v6.s[3]\n"
"fmla v16.4s, v24.4s, v8.s[0]\n"
@@ -757,8 +757,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v24.4s, v8.s[2]\n"
"fmla v19.4s, v24.4s, v8.s[3]\n"
"ldr q24, [%x[params], #0x150]\n"
- "fmla v12.4s, v23.4s, v6.s[1]\n"
"fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
"fmla v14.4s, v23.4s, v6.s[3]\n"
"fmla v15.4s, v23.4s, v7.s[0]\n"
"fmla v16.4s, v23.4s, v8.s[1]\n"
@@ -766,8 +766,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v23.4s, v8.s[3]\n"
"fmla v19.4s, v23.4s, v9.s[0]\n"
"ldr q23, [%x[params], #0x160]\n"
- "fmla v12.4s, v22.4s, v6.s[2]\n"
"fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
"fmla v14.4s, v22.4s, v7.s[0]\n"
"fmla v15.4s, v22.4s, v7.s[1]\n"
"fmla v16.4s, v22.4s, v8.s[2]\n"
@@ -775,8 +775,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v22.4s, v9.s[0]\n"
"fmla v19.4s, v22.4s, v9.s[1]\n"
"ldr q22, [%x[params], #0x170]\n"
- "fmla v12.4s, v21.4s, v6.s[3]\n"
"fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
"fmla v14.4s, v21.4s, v7.s[1]\n"
"fmla v15.4s, v21.4s, v7.s[2]\n"
"fmla v16.4s, v21.4s, v8.s[3]\n"
@@ -784,8 +784,8 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v21.4s, v9.s[1]\n"
"fmla v19.4s, v21.4s, v9.s[2]\n"
"ldr q21, [%x[params], #0x180]\n"
- "fmla v12.4s, v20.4s, v7.s[0]\n"
"fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
"fmla v14.4s, v20.4s, v7.s[2]\n"
"fmla v15.4s, v20.4s, v7.s[3]\n"
"fmla v16.4s, v20.4s, v9.s[0]\n"
@@ -793,50 +793,50 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla v18.4s, v20.4s, v9.s[2]\n"
"fmla v19.4s, v20.4s, v9.s[3]\n"
"ldr q20, [%x[params], #0x190]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
"add %x[params], %x[params], #0x1a0\n"
"fmla v12.4s, v24.4s, v8.s[0]\n"
- "fmla v13.4s, v24.4s, v8.s[1]\n"
"fmla v14.4s, v24.4s, v8.s[2]\n"
"fmla v15.4s, v24.4s, v8.s[3]\n"
"fmla v16.4s, v24.4s, v10.s[0]\n"
"fmla v17.4s, v24.4s, v10.s[1]\n"
"fmla v18.4s, v24.4s, v10.s[2]\n"
"fmla v19.4s, v24.4s, v10.s[3]\n"
- "fmla v12.4s, v23.4s, v8.s[1]\n"
"fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
"fmla v14.4s, v23.4s, v8.s[3]\n"
"fmla v15.4s, v23.4s, v9.s[0]\n"
"fmla v16.4s, v23.4s, v10.s[1]\n"
"fmla v17.4s, v23.4s, v10.s[2]\n"
"fmla v18.4s, v23.4s, v10.s[3]\n"
"fmla v19.4s, v23.4s, v11.s[0]\n"
- "fmla v12.4s, v22.4s, v8.s[2]\n"
"fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
"fmla v14.4s, v22.4s, v9.s[0]\n"
"fmla v15.4s, v22.4s, v9.s[1]\n"
"fmla v16.4s, v22.4s, v10.s[2]\n"
"fmla v17.4s, v22.4s, v10.s[3]\n"
"fmla v18.4s, v22.4s, v11.s[0]\n"
"fmla v19.4s, v22.4s, v11.s[1]\n"
- "fmla v12.4s, v21.4s, v8.s[3]\n"
"fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
"fmla v14.4s, v21.4s, v9.s[1]\n"
"fmla v15.4s, v21.4s, v9.s[2]\n"
"fmla v16.4s, v21.4s, v10.s[3]\n"
"fmla v17.4s, v21.4s, v11.s[0]\n"
"fmla v18.4s, v21.4s, v11.s[1]\n"
"fmla v19.4s, v21.4s, v11.s[2]\n"
- "fmla v12.4s, v20.4s, v9.s[0]\n"
"fmla v13.4s, v20.4s, v9.s[1]\n"
- "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
"fmla v14.4s, v20.4s, v9.s[2]\n"
"fmla v15.4s, v20.4s, v9.s[3]\n"
- "fmin v13.4s, v13.4s, v25.4s\n"
"fmla v16.4s, v20.4s, v11.s[0]\n"
"fmla v17.4s, v20.4s, v11.s[1]\n"
- "fmin v14.4s, v14.4s, v25.4s\n"
"fmla v18.4s, v20.4s, v11.s[2]\n"
"fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
"fmin v15.4s, v15.4s, v25.4s\n"
"fmin v16.4s, v16.4s, v25.4s\n"
"fmin v17.4s, v17.4s, v25.4s\n"
@@ -851,35 +851,35 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmax v18.4s, v18.4s, v26.4s\n"
"fmax v19.4s, v19.4s, v26.4s\n"
"tbz %x[channel_multiplier], #1, 4f\n"
- "add x20, x12, x13\n"
- "add x21, x11, x13\n"
- "st1 { v12.d }[0], [x20]\n"
- "add x20, x10, x13\n"
- "add x24, x9, x13\n"
- "st1 { v13.d }[0], [x21]\n"
- "add x23, x28, x13\n"
- "add x22, x27, x13\n"
- "st1 { v14.d }[0], [x20]\n"
- "add x21, x26, x13\n"
- "add x20, x25, x13\n"
+ "add x21, x13, x14\n"
+ "add x20, x12, x14\n"
+ "add x25, x11, x14\n"
+ "add x24, x10, x14\n"
+ "st1 { v12.d }[0], [x21]\n"
+ "add x23, x9, x14\n"
+ "add x22, x28, x14\n"
+ "st1 { v13.d }[0], [x20]\n"
+ "add x21, x27, x14\n"
+ "add x20, x26, x14\n"
+ "st1 { v14.d }[0], [x25]\n"
"st1 { v15.d }[0], [x24]\n"
+ "add x14, x14, #0x8\n"
"st1 { v16.d }[0], [x23]\n"
- "add x13, x13, #0x8\n"
"st1 { v17.d }[0], [x22]\n"
"st1 { v18.d }[0], [x21]\n"
"st1 { v19.d }[0], [x20]\n"
"tbz %x[channel_multiplier], #0, 5f\n"
- "add x20, x12, x13\n"
- "add x21, x11, x13\n"
- "st1 { v12.s }[2], [x20]\n"
- "add x20, x10, x13\n"
- "add x24, x9, x13\n"
- "st1 { v13.s }[2], [x21]\n"
- "add x23, x28, x13\n"
- "add x22, x27, x13\n"
- "st1 { v14.s }[2], [x20]\n"
- "add x21, x26, x13\n"
- "add x20, x25, x13\n"
+ "add x21, x13, x14\n"
+ "add x20, x12, x14\n"
+ "add x25, x11, x14\n"
+ "add x24, x10, x14\n"
+ "st1 { v12.s }[2], [x21]\n"
+ "add x23, x9, x14\n"
+ "add x22, x28, x14\n"
+ "st1 { v13.s }[2], [x20]\n"
+ "add x21, x27, x14\n"
+ "add x20, x26, x14\n"
+ "st1 { v14.s }[2], [x25]\n"
"st1 { v15.s }[2], [x24]\n"
"st1 { v16.s }[2], [x23]\n"
"st1 { v17.s }[2], [x22]\n"
@@ -887,17 +887,17 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"st1 { v19.s }[2], [x20]\n"
"b 5f\n"
"4:" // Output channel oddments: Store: Bit 1: Unset
- "add x20, x12, x13\n"
- "add x21, x11, x13\n"
- "st1 { v12.s }[0], [x20]\n"
- "add x20, x10, x13\n"
- "add x24, x9, x13\n"
- "st1 { v13.s }[0], [x21]\n"
- "add x23, x28, x13\n"
- "add x22, x27, x13\n"
- "st1 { v14.s }[0], [x20]\n"
- "add x21, x26, x13\n"
- "add x20, x25, x13\n"
+ "add x21, x13, x14\n"
+ "add x20, x12, x14\n"
+ "add x25, x11, x14\n"
+ "add x24, x10, x14\n"
+ "st1 { v12.s }[0], [x21]\n"
+ "add x23, x9, x14\n"
+ "add x22, x28, x14\n"
+ "st1 { v13.s }[0], [x20]\n"
+ "add x21, x27, x14\n"
+ "add x20, x26, x14\n"
+ "st1 { v14.s }[0], [x25]\n"
"st1 { v15.s }[0], [x24]\n"
"st1 { v16.s }[0], [x23]\n"
"st1 { v17.s }[0], [x22]\n"
@@ -907,7 +907,7 @@ void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"6:" // End
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index cc18dd4bb4..83f3528286 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,22 +58,22 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"2:" // Output channel loop: Load bias: Done
"ldr q10, [%x[weights], #0x0]\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q2, [x21, #0x10]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
- "ldr q1, [x20, #0x0]\n"
- "ldr q0, [x20, #0x10]\n"
"mov v18.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
"mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
+ "ldp x21, x20, [x22], #0x10\n"
"mov v21.16b, v31.16b\n"
- "add %x[weights], %x[weights], #0x10\n"
"mov v22.16b, v31.16b\n"
"mov v23.16b, v31.16b\n"
"mov v24.16b, v31.16b\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"mov v25.16b, v31.16b\n"
"mov v26.16b, v31.16b\n"
"mov v27.16b, v31.16b\n"
@@ -98,9 +98,9 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"subs x23, x23, #0x1\n"
"fmla v18.4s, v10.4s, v3.s[2]\n"
"fmla v19.4s, v10.4s, v3.s[3]\n"
- "ldr q3, [x21, #0x0]\n"
"fmla v20.4s, v10.4s, v2.s[0]\n"
"fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr q3, [x21, #0x0]\n"
"fmla v22.4s, v10.4s, v2.s[2]\n"
"fmla v23.4s, v10.4s, v2.s[3]\n"
"ldr q2, [x21, #0x10]\n"
@@ -120,9 +120,9 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v17.4s, v9.4s, v8.s[1]\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x21, #0x0]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
+ "ldr q8, [x21, #0x0]\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
"ldr q7, [x21, #0x10]\n"
@@ -168,71 +168,71 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v31.4s, v10.4s, v0.s[3]\n"
"fmla v16.4s, v9.4s, v8.s[0]\n"
"fmla v17.4s, v9.4s, v8.s[1]\n"
- "fmin v16.4s, v16.4s, v11.4s\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
- "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
- "fmin v19.4s, v19.4s, v11.4s\n"
"fmla v24.4s, v9.4s, v6.s[0]\n"
"fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
"fmla v26.4s, v9.4s, v6.s[2]\n"
"fmla v27.4s, v9.4s, v6.s[3]\n"
- "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v28.4s, v9.4s, v5.s[0]\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v30.4s, v9.4s, v5.s[2]\n"
"fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
"fmin v23.4s, v23.4s, v11.4s\n"
"fmax v16.4s, v16.4s, v12.4s\n"
"fmax v17.4s, v17.4s, v12.4s\n"
- "str q16, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"fmax v18.4s, v18.4s, v12.4s\n"
"fmax v19.4s, v19.4s, v12.4s\n"
- "str q17, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"fmax v20.4s, v20.4s, v12.4s\n"
"fmax v21.4s, v21.4s, v12.4s\n"
- "str q18, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"fmax v22.4s, v22.4s, v12.4s\n"
"fmax v23.4s, v23.4s, v12.4s\n"
- "str q19, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"fmin v24.4s, v24.4s, v11.4s\n"
"fmin v25.4s, v25.4s, v11.4s\n"
- "str q20, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"fmin v26.4s, v26.4s, v11.4s\n"
"fmin v27.4s, v27.4s, v11.4s\n"
- "str q21, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"fmin v28.4s, v28.4s, v11.4s\n"
"fmin v29.4s, v29.4s, v11.4s\n"
- "str q22, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"fmin v30.4s, v30.4s, v11.4s\n"
"fmin v31.4s, v31.4s, v11.4s\n"
- "str q23, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"fmax v24.4s, v24.4s, v12.4s\n"
"fmax v25.4s, v25.4s, v12.4s\n"
- "str q24, [x27, x28]\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"fmax v26.4s, v26.4s, v12.4s\n"
"fmax v27.4s, v27.4s, v12.4s\n"
- "str q25, [x26, x28]\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"fmax v28.4s, v28.4s, v12.4s\n"
"fmax v29.4s, v29.4s, v12.4s\n"
- "str q26, [x25, x28]\n"
"fmax v30.4s, v30.4s, v12.4s\n"
"fmax v31.4s, v31.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "str q25, [x26, x28]\n"
+ "str q26, [x25, x28]\n"
"str q27, [x24, x28]\n"
"str q28, [x23, x28]\n"
"str q29, [x22, x28]\n"
@@ -246,16 +246,16 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"lsl x28, x10, #0x2\n"
"fmla v18.4s, v10.4s, v3.s[2]\n"
"fmla v19.4s, v10.4s, v3.s[3]\n"
- "ldr q4, [x20, #0x0]\n"
"ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"fmla v20.4s, v10.4s, v2.s[0]\n"
"fmla v21.4s, v10.4s, v2.s[1]\n"
- "ldr x26, [%x[outptrs], #0x8]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr q4, [x20, #0x0]\n"
"fmla v22.4s, v10.4s, v2.s[2]\n"
"fmla v23.4s, v10.4s, v2.s[3]\n"
"ldr q3, [x20, #0x10]\n"
- "ldr x24, [%x[outptrs], #0x18]\n"
"fmla v24.4s, v10.4s, v1.s[0]\n"
"fmla v25.4s, v10.4s, v1.s[1]\n"
"ldr x23, [%x[outptrs], #0x20]\n"
@@ -290,71 +290,71 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v31.4s, v9.4s, v5.s[3]\n"
"fmla v16.4s, v1.4s, v4.s[0]\n"
"fmla v17.4s, v1.4s, v4.s[1]\n"
- "fmin v16.4s, v16.4s, v11.4s\n"
"fmla v18.4s, v1.4s, v4.s[2]\n"
"fmla v19.4s, v1.4s, v4.s[3]\n"
- "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v20.4s, v1.4s, v3.s[0]\n"
"fmla v21.4s, v1.4s, v3.s[1]\n"
- "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v22.4s, v1.4s, v3.s[2]\n"
"fmla v23.4s, v1.4s, v3.s[3]\n"
- "fmin v19.4s, v19.4s, v11.4s\n"
"fmla v24.4s, v1.4s, v2.s[0]\n"
"fmla v25.4s, v1.4s, v2.s[1]\n"
- "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
"fmla v26.4s, v1.4s, v2.s[2]\n"
"fmla v27.4s, v1.4s, v2.s[3]\n"
- "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v28.4s, v1.4s, v0.s[0]\n"
"fmla v29.4s, v1.4s, v0.s[1]\n"
- "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v30.4s, v1.4s, v0.s[2]\n"
"fmla v31.4s, v1.4s, v0.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
"fmin v23.4s, v23.4s, v11.4s\n"
"fmax v16.4s, v16.4s, v12.4s\n"
"fmax v17.4s, v17.4s, v12.4s\n"
- "str q16, [x27, x28]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"fmax v18.4s, v18.4s, v12.4s\n"
"fmax v19.4s, v19.4s, v12.4s\n"
- "str q17, [x26, x28]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"fmax v20.4s, v20.4s, v12.4s\n"
"fmax v21.4s, v21.4s, v12.4s\n"
- "str q18, [x25, x28]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"fmax v22.4s, v22.4s, v12.4s\n"
"fmax v23.4s, v23.4s, v12.4s\n"
- "str q19, [x24, x28]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"fmin v24.4s, v24.4s, v11.4s\n"
"fmin v25.4s, v25.4s, v11.4s\n"
- "str q20, [x23, x28]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"fmin v26.4s, v26.4s, v11.4s\n"
"fmin v27.4s, v27.4s, v11.4s\n"
- "str q21, [x22, x28]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"fmin v28.4s, v28.4s, v11.4s\n"
"fmin v29.4s, v29.4s, v11.4s\n"
- "str q22, [x21, x28]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"fmin v30.4s, v30.4s, v11.4s\n"
"fmin v31.4s, v31.4s, v11.4s\n"
- "str q23, [x20, x28]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"fmax v24.4s, v24.4s, v12.4s\n"
"fmax v25.4s, v25.4s, v12.4s\n"
- "str q24, [x27, x28]\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"fmax v26.4s, v26.4s, v12.4s\n"
"fmax v27.4s, v27.4s, v12.4s\n"
- "str q25, [x26, x28]\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"fmax v28.4s, v28.4s, v12.4s\n"
"fmax v29.4s, v29.4s, v12.4s\n"
- "str q26, [x25, x28]\n"
"fmax v30.4s, v30.4s, v12.4s\n"
"fmax v31.4s, v31.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "str q25, [x26, x28]\n"
+ "str q26, [x25, x28]\n"
"str q27, [x24, x28]\n"
"str q28, [x23, x28]\n"
"str q29, [x22, x28]\n"
@@ -364,80 +364,80 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"6:" // Output channel loop: Single kernel point
"fmla v16.4s, v10.4s, v3.s[0]\n"
"fmla v17.4s, v10.4s, v3.s[1]\n"
- "fmin v16.4s, v16.4s, v11.4s\n"
"lsl x28, x10, #0x2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
"fmla v18.4s, v10.4s, v3.s[2]\n"
"fmla v19.4s, v10.4s, v3.s[3]\n"
- "fmin v17.4s, v17.4s, v11.4s\n"
- "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
"fmla v20.4s, v10.4s, v2.s[0]\n"
"fmla v21.4s, v10.4s, v2.s[1]\n"
- "fmin v18.4s, v18.4s, v11.4s\n"
- "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
"fmla v22.4s, v10.4s, v2.s[2]\n"
"fmla v23.4s, v10.4s, v2.s[3]\n"
- "fmin v19.4s, v19.4s, v11.4s\n"
- "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
"fmla v24.4s, v10.4s, v1.s[0]\n"
"fmla v25.4s, v10.4s, v1.s[1]\n"
- "fmin v20.4s, v20.4s, v11.4s\n"
- "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
"fmla v26.4s, v10.4s, v1.s[2]\n"
"fmla v27.4s, v10.4s, v1.s[3]\n"
- "fmin v21.4s, v21.4s, v11.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
"fmla v28.4s, v10.4s, v0.s[0]\n"
"fmla v29.4s, v10.4s, v0.s[1]\n"
- "fmin v22.4s, v22.4s, v11.4s\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
"fmla v30.4s, v10.4s, v0.s[2]\n"
"fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
"fmin v23.4s, v23.4s, v11.4s\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"fmax v16.4s, v16.4s, v12.4s\n"
"fmax v17.4s, v17.4s, v12.4s\n"
- "str q16, [x27, x28]\n"
"fmax v18.4s, v18.4s, v12.4s\n"
"fmax v19.4s, v19.4s, v12.4s\n"
- "str q17, [x26, x28]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"fmax v20.4s, v20.4s, v12.4s\n"
"fmax v21.4s, v21.4s, v12.4s\n"
- "str q18, [x25, x28]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"fmax v22.4s, v22.4s, v12.4s\n"
"fmax v23.4s, v23.4s, v12.4s\n"
- "str q19, [x24, x28]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"fmin v24.4s, v24.4s, v11.4s\n"
"fmin v25.4s, v25.4s, v11.4s\n"
- "str q20, [x23, x28]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"fmin v26.4s, v26.4s, v11.4s\n"
"fmin v27.4s, v27.4s, v11.4s\n"
- "str q21, [x22, x28]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"fmin v28.4s, v28.4s, v11.4s\n"
"fmin v29.4s, v29.4s, v11.4s\n"
- "str q22, [x21, x28]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"fmin v30.4s, v30.4s, v11.4s\n"
"fmin v31.4s, v31.4s, v11.4s\n"
- "str q23, [x20, x28]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"fmax v24.4s, v24.4s, v12.4s\n"
"fmax v25.4s, v25.4s, v12.4s\n"
- "str q24, [x27, x28]\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"fmax v26.4s, v26.4s, v12.4s\n"
"fmax v27.4s, v27.4s, v12.4s\n"
- "str q25, [x26, x28]\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"fmax v28.4s, v28.4s, v12.4s\n"
"fmax v29.4s, v29.4s, v12.4s\n"
- "str q26, [x25, x28]\n"
"fmax v30.4s, v30.4s, v12.4s\n"
"fmax v31.4s, v31.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "str q25, [x26, x28]\n"
+ "str q26, [x25, x28]\n"
"str q27, [x24, x28]\n"
"str q28, [x23, x28]\n"
"str q29, [x22, x28]\n"
@@ -464,22 +464,22 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"11:" // Output channel oddments: Load bias: Done
"ldr q10, [%x[weights], #0x0]\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
- "ldr q3, [x21, #0x0]\n"
- "ldr q2, [x21, #0x10]\n"
"mov v16.16b, v31.16b\n"
"mov v17.16b, v31.16b\n"
- "ldr q1, [x20, #0x0]\n"
- "ldr q0, [x20, #0x10]\n"
"mov v18.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
"mov v19.16b, v31.16b\n"
"mov v20.16b, v31.16b\n"
+ "ldp x21, x20, [x22], #0x10\n"
"mov v21.16b, v31.16b\n"
- "add %x[weights], %x[weights], #0x10\n"
"mov v22.16b, v31.16b\n"
"mov v23.16b, v31.16b\n"
"mov v24.16b, v31.16b\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"mov v25.16b, v31.16b\n"
"mov v26.16b, v31.16b\n"
"mov v27.16b, v31.16b\n"
@@ -504,9 +504,9 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"subs x23, x23, #0x1\n"
"fmla v18.4s, v10.4s, v3.s[2]\n"
"fmla v19.4s, v10.4s, v3.s[3]\n"
- "ldr q3, [x21, #0x0]\n"
"fmla v20.4s, v10.4s, v2.s[0]\n"
"fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr q3, [x21, #0x0]\n"
"fmla v22.4s, v10.4s, v2.s[2]\n"
"fmla v23.4s, v10.4s, v2.s[3]\n"
"ldr q2, [x21, #0x10]\n"
@@ -526,9 +526,9 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"fmla v17.4s, v9.4s, v8.s[1]\n"
"fmla v18.4s, v9.4s, v8.s[2]\n"
"fmla v19.4s, v9.4s, v8.s[3]\n"
- "ldr q8, [x21, #0x0]\n"
"fmla v20.4s, v9.4s, v7.s[0]\n"
"fmla v21.4s, v9.4s, v7.s[1]\n"
+ "ldr q8, [x21, #0x0]\n"
"fmla v22.4s, v9.4s, v7.s[2]\n"
"fmla v23.4s, v9.4s, v7.s[3]\n"
"ldr q7, [x21, #0x10]\n"
@@ -586,9 +586,9 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"ldp x21, x20, [x22], #0x10\n"
"fmla v18.4s, v10.4s, v3.s[2]\n"
"fmla v19.4s, v10.4s, v3.s[3]\n"
- "ldr q4, [x21, #0x0]\n"
"fmla v20.4s, v10.4s, v2.s[0]\n"
"fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v22.4s, v10.4s, v2.s[2]\n"
"fmla v23.4s, v10.4s, v2.s[3]\n"
"ldr q3, [x21, #0x10]\n"
@@ -690,47 +690,47 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #1, 17f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #2\n"
- "add x26, x26, x10, LSL #2\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #2\n"
- "add x24, x24, x10, LSL #2\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #2\n"
- "add x22, x22, x10, LSL #2\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #2\n"
- "add x20, x20, x10, LSL #2\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
"st1 { v16.d }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #2\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
"st1 { v17.d }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #2\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
"st1 { v18.d }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #2\n"
"st1 { v19.d }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #2\n"
+ "add x27, x27, x10, LSL #2\n"
"st1 { v20.d }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
"st1 { v21.d }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #2\n"
+ "add x25, x25, x10, LSL #2\n"
"st1 { v22.d }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
"st1 { v23.d }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #2\n"
- "add x10, x10, #0x2\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
"st1 { v24.d }[0], [x27]\n"
+ "add x21, x21, x10, LSL #2\n"
"st1 { v25.d }[0], [x26]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "add x10, x10, #0x2\n"
"st1 { v26.d }[0], [x25]\n"
"st1 { v27.d }[0], [x24]\n"
"st1 { v28.d }[0], [x23]\n"
@@ -740,46 +740,46 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"tbz %x[n_output_channels], #0, 18f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #2\n"
- "add x26, x26, x10, LSL #2\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #2\n"
- "add x24, x24, x10, LSL #2\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #2\n"
- "add x22, x22, x10, LSL #2\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #2\n"
- "add x20, x20, x10, LSL #2\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
"st1 { v16.s }[2], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #2\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
"st1 { v17.s }[2], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #2\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
"st1 { v18.s }[2], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #2\n"
"st1 { v19.s }[2], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #2\n"
+ "add x27, x27, x10, LSL #2\n"
"st1 { v20.s }[2], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
"st1 { v21.s }[2], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #2\n"
+ "add x25, x25, x10, LSL #2\n"
"st1 { v22.s }[2], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
"st1 { v23.s }[2], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #2\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
"st1 { v24.s }[2], [x27]\n"
+ "add x21, x21, x10, LSL #2\n"
"st1 { v25.s }[2], [x26]\n"
+ "add x20, x20, x10, LSL #2\n"
"st1 { v26.s }[2], [x25]\n"
"st1 { v27.s }[2], [x24]\n"
"st1 { v28.s }[2], [x23]\n"
@@ -790,46 +790,46 @@ void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"17:" // Output channel oddments: Done: Store: Bit 1: Unset
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x10, LSL #2\n"
- "add x26, x26, x10, LSL #2\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x10, LSL #2\n"
- "add x24, x24, x10, LSL #2\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x10, LSL #2\n"
- "add x22, x22, x10, LSL #2\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x10, LSL #2\n"
- "add x20, x20, x10, LSL #2\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
"st1 { v16.s }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x10, LSL #2\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
"st1 { v17.s }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x10, LSL #2\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
"st1 { v18.s }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x10, LSL #2\n"
"st1 { v19.s }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x10, LSL #2\n"
+ "add x27, x27, x10, LSL #2\n"
"st1 { v20.s }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
"st1 { v21.s }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x10, LSL #2\n"
+ "add x25, x25, x10, LSL #2\n"
"st1 { v22.s }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
"st1 { v23.s }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x10, LSL #2\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
"st1 { v24.s }[0], [x27]\n"
+ "add x21, x21, x10, LSL #2\n"
"st1 { v25.s }[0], [x26]\n"
+ "add x20, x20, x10, LSL #2\n"
"st1 { v26.s }[0], [x25]\n"
"st1 { v27.s }[0], [x24]\n"
"st1 { v28.s }[0], [x23]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 916c8a4afe..8af5e63a4b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,1622 +33,1622 @@ namespace depthwise {
void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x20, #0x1\n"
- "orr x20, x20, #0x100\n"
+ "mov x17, #0x1\n"
+ "lsr x16, %x[n_channels], #0x4\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "orr x20, x20, #0x10000\n"
- "lsr x11, %x[n_channels], #0x4\n"
- "dup v12.4s, w20\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ldp x27, x26, [%x[inptrs], #0x10]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_minval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ldp x25, x24, [%x[inptrs], #0x20]\n"
+ "ldp x23, x22, [%x[inptrs], #0x30]\n"
+ "ld1r { v7.4s }, [x21]\n"
"ld1r { v11.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x0\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "ldp x25, x24, [%x[outptrs], #0x0]\n"
- "ldp x23, x22, [%x[outptrs], #0x10]\n"
- "cbz x11, 3f\n"
- "ldr q15, [x15, x28]\n"
- "ldr q28, [x14, x28]\n"
- "subs x11, x11, #0x1\n"
- "ldr q30, [x13, x28]\n"
- "ldr q8, [x12, x28]\n"
- "zip2 v19.16b, v15.16b, v30.16b\n"
- "zip1 v15.16b, v15.16b, v30.16b\n"
- "ldr q26, [x10, x28]\n"
- "ldr q0, [x9, x28]\n"
- "zip1 v7.16b, v28.16b, v8.16b\n"
- "zip2 v8.16b, v28.16b, v8.16b\n"
- "ldr q29, [x26, x28]\n"
- "ldr q10, [x21, x28]\n"
- "zip2 v25.16b, v15.16b, v7.16b\n"
- "zip1 v15.16b, v15.16b, v7.16b\n"
- "ldr q1, [%x[params], #0x10]\n"
- "ldr q6, [%x[params], #0x20]\n"
- "zip1 v7.16b, v19.16b, v8.16b\n"
- "zip2 v8.16b, v19.16b, v8.16b\n"
- "ldr q31, [%x[params], #0x0]\n"
- "ldr q20, [%x[params], #0x30]\n"
- "zip2 v21.16b, v26.16b, v29.16b\n"
- "zip1 v26.16b, v26.16b, v29.16b\n"
- "ldp x21, x20, [%x[inptrs], #0x40]\n"
- "ldr q22, [x21, x28]\n"
- "zip1 v27.16b, v0.16b, v10.16b\n"
- "zip2 v10.16b, v0.16b, v10.16b\n"
- "ldr q17, [x20, x28]\n"
- "ldp x21, x20, [%x[inptrs], #0x50]\n"
- "zip2 v23.16b, v26.16b, v27.16b\n"
- "zip1 v26.16b, v26.16b, v27.16b\n"
- "ldr q9, [x21, x28]\n"
- "ldr q5, [x20, x28]\n"
- "zip2 v28.16b, v22.16b, v9.16b\n"
- "zip1 v22.16b, v22.16b, v9.16b\n"
- "ldp x21, x20, [%x[inptrs], #0x60]\n"
- "ldr q27, [x21, x28]\n"
- "zip1 v24.16b, v17.16b, v5.16b\n"
- "zip2 v5.16b, v17.16b, v5.16b\n"
- "ldr q18, [x20, x28]\n"
+ "ld1r { v24.4s }, [x21]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "orr x17, x17, #0x100\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "orr x17, x17, #0x10000\n"
+ "dup v15.4s, w17\n"
+ "cbz x16, 3f\n"
+ "ldr q13, [x15, x13]\n"
+ "ldr q5, [x14, x13]\n"
+ "subs x16, x16, #0x1\n"
+ "ldr q27, [x27, x13]\n"
+ "ldr q9, [x26, x13]\n"
+ "ldr q1, [x25, x13]\n"
+ "ldr q28, [x24, x13]\n"
+ "ldr q26, [x23, x13]\n"
+ "ldr q4, [x22, x13]\n"
+ "ldr q30, [%x[params], #0x10]\n"
+ "ldr q8, [%x[params], #0x20]\n"
+ "zip2 v19.16b, v13.16b, v27.16b\n"
+ "zip1 v13.16b, v13.16b, v27.16b\n"
+ "ldr q17, [%x[params], #0x30]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "zip1 v3.16b, v5.16b, v9.16b\n"
+ "zip2 v9.16b, v5.16b, v9.16b\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "zip2 v18.16b, v1.16b, v26.16b\n"
+ "zip1 v1.16b, v1.16b, v26.16b\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "zip1 v3.16b, v21.16b, v10.16b\n"
- "zip2 v10.16b, v21.16b, v10.16b\n"
- "ldr q4, [x21, x28]\n"
- "ldr q9, [x20, x28]\n"
- "zip2 v17.16b, v27.16b, v4.16b\n"
- "zip1 v27.16b, v27.16b, v4.16b\n"
- "zip1 v4.16b, v18.16b, v9.16b\n"
- "zip2 v9.16b, v18.16b, v9.16b\n"
+ "zip1 v16.16b, v28.16b, v4.16b\n"
+ "zip2 v4.16b, v28.16b, v4.16b\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr q14, [x26, x13]\n"
+ "zip2 v2.16b, v13.16b, v3.16b\n"
+ "zip1 v13.16b, v13.16b, v3.16b\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "zip2 v19.16b, v22.16b, v24.16b\n"
- "zip1 v22.16b, v22.16b, v24.16b\n"
- "zip1 v0.16b, v28.16b, v5.16b\n"
- "zip2 v5.16b, v28.16b, v5.16b\n"
+ "ldr q3, [x25, x13]\n"
+ "ldr q6, [x24, x13]\n"
+ "zip1 v0.16b, v19.16b, v9.16b\n"
+ "zip2 v9.16b, v19.16b, v9.16b\n"
+ "ldr q5, [x23, x13]\n"
+ "ldr q20, [x22, x13]\n"
+ "zip2 v21.16b, v1.16b, v16.16b\n"
+ "zip1 v1.16b, v1.16b, v16.16b\n"
+ "ldr q16, [x21, x13]\n"
+ "ldr q25, [x20, x13]\n"
+ "zip1 v28.16b, v18.16b, v4.16b\n"
+ "zip2 v4.16b, v18.16b, v4.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v19.16b, v10.16b, v3.16b\n"
+ "zip1 v10.16b, v10.16b, v3.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x10]\n"
+ "zip1 v18.16b, v14.16b, v6.16b\n"
+ "zip2 v6.16b, v14.16b, v6.16b\n"
+ "ldp x25, x24, [%x[inptrs], #0x20]\n"
+ "ldp x23, x22, [%x[inptrs], #0x30]\n"
+ "zip2 v23.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v24.16b, v27.16b, v4.16b\n"
- "zip1 v27.16b, v27.16b, v4.16b\n"
- "zip1 v2.16b, v17.16b, v9.16b\n"
- "zip2 v9.16b, v17.16b, v9.16b\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "mov v28.16b, v31.16b\n"
+ "zip1 v16.16b, v20.16b, v25.16b\n"
+ "zip2 v25.16b, v20.16b, v25.16b\n"
+ "zip2 v29.16b, v10.16b, v18.16b\n"
+ "zip1 v10.16b, v10.16b, v18.16b\n"
+ "zip1 v27.16b, v19.16b, v6.16b\n"
+ "zip2 v6.16b, v19.16b, v6.16b\n"
+ "zip2 v18.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "zip1 v14.16b, v23.16b, v25.16b\n"
+ "zip2 v25.16b, v23.16b, v25.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- "movi v21.4s, #0x0\n"
- ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
- ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
- ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
- "movi v18.4s, #0x0\n"
- "subs x11, x11, #0x1\n"
- ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
- ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
- ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
- ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
- ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
- ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
- "mls v31.4s, v21.4s, v16.4s\n"
- ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
- ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
- "ldr q26, [%x[params], #0x10]\n"
- ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
- ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
- "ldr q17, [%x[params], #0x0]\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
- ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
- "mls v30.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v21.4s, v16.4s\n"
- "and v15.16b, v31.16b, v26.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v17.4s\n"
- "sqrdmulh v29.4s, v29.4s, v17.4s\n"
- "sqrdmulh v28.4s, v28.4s, v17.4s\n"
- "ldr q1, [%x[params], #0x60]\n"
- "sqadd v31.4s, v31.4s, v15.4s\n"
- "and v18.16b, v30.16b, v26.16b\n"
- "and v21.16b, v29.16b, v26.16b\n"
- "and v17.16b, v28.16b, v26.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8d97df // sdot v31.4s, v30.16b, v13.16b\n"
+ ".inst 0x4e8197c3 // sdot v3.4s, v30.16b, v1.16b\n"
+ "add x13, x13, #0x10\n"
+ "movi v22.4s, #0x0\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0x4e8195f3 // sdot v19.4s, v15.16b, v1.16b\n"
+ ".inst 0x4e81951f // sdot v31.4s, v8.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8a9503 // sdot v3.4s, v8.16b, v10.16b\n"
+ ".inst 0x4e8a95f3 // sdot v19.4s, v15.16b, v10.16b\n"
+ ".inst 0x4e8195f6 // sdot v22.4s, v15.16b, v1.16b\n"
+ ".inst 0x4e8a963f // sdot v31.4s, v17.16b, v10.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e8197d7 // sdot v23.4s, v30.16b, v1.16b\n"
+ "mov v16.16b, v19.16b\n .inst 0x4e8595f0 // sdot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x4e8d95f3 // sdot v19.4s, v15.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e859623 // sdot v3.4s, v17.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a95f6 // sdot v22.4s, v15.16b, v10.16b\n"
+ ".inst 0x4e8d97da // sdot v26.4s, v30.16b, v13.16b\n"
+ ".inst 0x4e8a9517 // sdot v23.4s, v8.16b, v10.16b\n"
+ "mls v31.4s, v19.4s, v24.4s\n"
+ "movi v19.4s, #0x0\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x4e81951a // sdot v26.4s, v8.16b, v1.16b\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "mov v16.16b, v22.16b\n .inst 0x4e8595f0 // sdot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x4e8d95f6 // sdot v22.4s, v15.16b, v13.16b\n"
+ "ldr q1, [%x[params], #0x0]\n"
+ ".inst 0x4e9595f3 // sdot v19.4s, v15.16b, v21.16b\n"
+ ".inst 0x4e859637 // sdot v23.4s, v17.16b, v5.16b\n"
+ ".inst 0x4e8a963a // sdot v26.4s, v17.16b, v10.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v1.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v8.16b\n"
+ ".inst 0x4e9d95f3 // sdot v19.4s, v15.16b, v29.16b\n"
+ "mls v26.4s, v22.4s, v24.4s\n"
+ "movi v20.4s, #0x0\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "and v30.16b, v3.16b, v8.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "ldr q10, [%x[params], #0x60]\n"
+ "mov v22.16b, v19.16b\n .inst 0x4e9295f6 // sdot v22.4s, v15.16b, v18.16b\n"
+ ".inst 0x4e8295f3 // sdot v19.4s, v15.16b, v2.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v8.16b\n"
+ "and v16.16b, v26.16b, v8.16b\n"
+ "sqadd v3.4s, v3.4s, v30.4s\n"
+ "ldr q5, [%x[params], #0x50]\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v26.4s\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "ldr q18, [%x[params], #0x40]\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "ldr q27, [%x[params], #0x50]\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "ldr q15, [%x[params], #0x30]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v26.4s\n"
- "srshl v29.4s, v29.4s, v26.4s\n"
- "srshl v28.4s, v28.4s, v26.4s\n"
- "ldr q20, [%x[params], #0x70]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v8.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "ldr q30, [%x[params], #0x40]\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "ldr q1, [%x[params], #0x70]\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x4e979596 // sdot v22.4s, v12.16b, v23.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s31, [x25, x27]\n"
- "ldr q26, [%x[params], #0x20]\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- ".inst 0x4e939596 // sdot v22.4s, v12.16b, v19.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s30, [x24, x27]\n"
- "mov v6.16b, v22.16b\n .inst 0x4e989586 // sdot v6.4s, v12.16b, v24.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s29, [x23, x27]\n"
- "mov v30.16b, v26.16b\n"
- ".inst 0x4e999596 // sdot v22.4s, v12.16b, v25.16b\n"
- "str s28, [x22, x27]\n"
- "mov v29.16b, v26.16b\n"
- "mov v21.16b, v26.16b\n"
- ".inst 0x4e9995fa // sdot v26.4s, v15.16b, v25.16b\n"
- ".inst 0x4e9795fd // sdot v29.4s, v15.16b, v23.16b\n"
- ".inst 0x4e97965a // sdot v26.4s, v18.16b, v23.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- "movi v28.4s, #0x0\n"
- ".inst 0x4e9995fe // sdot v30.4s, v15.16b, v25.16b\n"
- ".inst 0x4e9795f5 // sdot v21.4s, v15.16b, v23.16b\n"
- ".inst 0x4e97959c // sdot v28.4s, v12.16b, v23.16b\n"
- ".inst 0x4e93965d // sdot v29.4s, v18.16b, v19.16b\n"
- ".inst 0x4e93977a // sdot v26.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e97965e // sdot v30.4s, v18.16b, v23.16b\n"
- "ldr q4, [x9, x28]\n"
- ".inst 0x4e939655 // sdot v21.4s, v18.16b, v19.16b\n"
- "mls v26.4s, v22.4s, v16.4s\n"
- ".inst 0x4e93959c // sdot v28.4s, v12.16b, v19.16b\n"
- ".inst 0x4e98977d // sdot v29.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e93977e // sdot v30.4s, v27.16b, v19.16b\n"
- ".inst 0x4e989775 // sdot v21.4s, v27.16b, v24.16b\n"
- "sqrdmulh v26.4s, v26.4s, v1.4s\n"
- "mov v17.16b, v28.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
- ".inst 0x4e99959c // sdot v28.4s, v12.16b, v25.16b\n"
- "ldr q31, [x14, x28]\n"
- "mls v30.4s, v28.4s, v16.4s\n"
- "mls v29.4s, v6.4s, v16.4s\n"
- "mls v21.4s, v17.4s, v16.4s\n"
- "and v17.16b, v26.16b, v20.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v1.4s\n"
- "sqrdmulh v29.4s, v29.4s, v1.4s\n"
- "sqrdmulh v21.4s, v21.4s, v1.4s\n"
- "ldr q27, [%x[params], #0xc0]\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "and v18.16b, v30.16b, v20.16b\n"
- "and v6.16b, v29.16b, v20.16b\n"
- "and v17.16b, v21.16b, v20.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "ldr q28, [%x[params], #0xa0]\n"
- "sqadd v29.4s, v29.4s, v6.4s\n"
- "ldr q24, [%x[params], #0xb0]\n"
- "sqadd v21.4s, v21.4s, v17.4s\n"
- "ldr q15, [%x[params], #0x90]\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v21.4s, v21.4s, v20.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
"smin v26.4s, v26.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s31, [x11, x12]\n"
+ "ldr q31, [%x[params], #0x20]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x4e839596 // sdot v22.4s, v12.16b, v3.16b\n"
- ".inst 0x4e809596 // sdot v22.4s, v12.16b, v0.16b\n"
+ "str s3, [x9, x12]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s26, [x25, x27]\n"
- "ldr q26, [%x[params], #0x80]\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "mov v18.16b, v22.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s30, [x24, x27]\n"
- ".inst 0x4e879596 // sdot v22.4s, v12.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s29, [x23, x27]\n"
- "mov v6.16b, v26.16b\n"
- "str s21, [x22, x27]\n"
- "mov v25.16b, v26.16b\n"
- "mov v20.16b, v26.16b\n"
- ".inst 0x4e8795fa // sdot v26.4s, v15.16b, v7.16b\n"
- ".inst 0x4e8395f9 // sdot v25.4s, v15.16b, v3.16b\n"
- ".inst 0x4e83979a // sdot v26.4s, v28.16b, v3.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v23.4s, #0x0\n"
- ".inst 0x4e8795e6 // sdot v6.4s, v15.16b, v7.16b\n"
- ".inst 0x4e8395f4 // sdot v20.4s, v15.16b, v3.16b\n"
- ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
- ".inst 0x4e809799 // sdot v25.4s, v28.16b, v0.16b\n"
- ".inst 0x4e80971a // sdot v26.4s, v24.16b, v0.16b\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e839786 // sdot v6.4s, v28.16b, v3.16b\n"
- "ldr q19, [x26, x28]\n"
- ".inst 0x4e809794 // sdot v20.4s, v28.16b, v0.16b\n"
- "mls v26.4s, v22.4s, v16.4s\n"
- ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
- ".inst 0x4e829719 // sdot v25.4s, v24.16b, v2.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "mov v8.16b, v31.16b\n"
+ "str s26, [x10, x12]\n"
+ "mov v16.16b, v31.16b\n"
+ "str s23, [x28, x12]\n"
+ "mov v26.16b, v31.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e959628 // sdot v8.4s, v17.16b, v21.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e809706 // sdot v6.4s, v24.16b, v0.16b\n"
- ".inst 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
- "sqrdmulh v26.4s, v26.4s, v27.4s\n"
- "mov v17.16b, v23.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
- ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
- "ldr q21, [x13, x28]\n"
- "mls v6.4s, v23.4s, v16.4s\n"
- "mls v25.4s, v18.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v26.16b, v1.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v6.4s, v6.4s, v27.4s\n"
- "sqrdmulh v25.4s, v25.4s, v27.4s\n"
- "sqrdmulh v20.4s, v20.4s, v27.4s\n"
- "ldr q15, [%x[params], #0x120]\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "and v18.16b, v6.16b, v1.16b\n"
- "and v22.16b, v25.16b, v1.16b\n"
- "and v17.16b, v20.16b, v1.16b\n"
+ ".inst 0x4e9597df // sdot v31.4s, v30.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e829630 // sdot v16.4s, v17.16b, v2.16b\n"
+ ".inst 0x4e95963a // sdot v26.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e9595f4 // sdot v20.4s, v15.16b, v21.16b\n"
+ ".inst 0x4e9d97c8 // sdot v8.4s, v30.16b, v29.16b\n"
+ ".inst 0x4e9d94bf // sdot v31.4s, v5.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9597d0 // sdot v16.4s, v30.16b, v21.16b\n"
+ "ldr q3, [x24, x13]\n"
+ ".inst 0x4e9d97da // sdot v26.4s, v30.16b, v29.16b\n"
+ ".inst 0x4e9d95f4 // sdot v20.4s, v15.16b, v29.16b\n"
+ ".inst 0x4e9294a8 // sdot v8.4s, v5.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "mls v31.4s, v19.4s, v24.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e9d94b0 // sdot v16.4s, v5.16b, v29.16b\n"
+ ".inst 0x4e9294ba // sdot v26.4s, v5.16b, v18.16b\n"
+ "mov v17.16b, v20.16b\n .inst 0x4e9295f1 // sdot v17.4s, v15.16b, v18.16b\n"
+ ".inst 0x4e8295f4 // sdot v20.4s, v15.16b, v2.16b\n"
+ "ldr q2, [x14, x13]\n"
+ ".inst 0x4e9c95f7 // sdot v23.4s, v15.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v10.4s\n"
+ "mls v8.4s, v22.4s, v24.4s\n"
+ "mls v26.4s, v17.4s, v24.4s\n"
+ "and v18.16b, v31.16b, v1.16b\n"
+ "mls v16.4s, v20.4s, v24.4s\n"
+ "movi v21.4s, #0x0\n"
+ "sqrdmulh v8.4s, v8.4s, v10.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v10.4s\n"
+ ".inst 0x4e9b95f7 // sdot v23.4s, v15.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqrdmulh v16.4s, v16.4s, v10.4s\n"
+ "ldr q13, [%x[params], #0xc0]\n"
+ "and v17.16b, v8.16b, v1.16b\n"
+ "sqadd v31.4s, v31.4s, v18.4s\n"
+ "and v20.16b, v26.16b, v1.16b\n"
+ "and v10.16b, v16.16b, v1.16b\n"
+ "mov v19.16b, v23.16b\n .inst 0x4e8e95f3 // sdot v19.4s, v15.16b, v14.16b\n"
+ ".inst 0x4e8095f7 // sdot v23.4s, v15.16b, v0.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "ldr q30, [%x[params], #0xb0]\n"
+ "sqadd v16.4s, v16.4s, v10.4s\n"
+ "ldr q17, [%x[params], #0xa0]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "srshl v8.4s, v8.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
"srshl v26.4s, v26.4s, v1.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "ldr q30, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "add v8.4s, v8.4s, v12.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v8.4s, v8.4s, v7.4s\n"
+ "smax v16.4s, v16.4s, v7.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v8.4s, v8.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s31, [x11, x12]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s16, [x10, x12]\n"
+ "mov v18.16b, v10.16b\n"
+ "str s8, [x9, x12]\n"
+ "mov v8.16b, v10.16b\n"
+ "str s26, [x28, x12]\n"
+ "mov v26.16b, v10.16b\n"
+ ".inst 0x4e80968a // sdot v10.4s, v20.16b, v0.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e9c9688 // sdot v8.4s, v20.16b, v28.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e9c962a // sdot v10.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e809692 // sdot v18.4s, v20.16b, v0.16b\n"
+ ".inst 0x4e9c969a // sdot v26.4s, v20.16b, v28.16b\n"
+ ".inst 0x4e9c95f5 // sdot v21.4s, v15.16b, v28.16b\n"
+ ".inst 0x4e9b9628 // sdot v8.4s, v17.16b, v27.16b\n"
+ ".inst 0x4e9b97ca // sdot v10.4s, v30.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9c9632 // sdot v18.4s, v17.16b, v28.16b\n"
+ "ldr q28, [x23, x13]\n"
+ ".inst 0x4e9b963a // sdot v26.4s, v17.16b, v27.16b\n"
+ ".inst 0x4e9b95f5 // sdot v21.4s, v15.16b, v27.16b\n"
+ ".inst 0x4e8e97c8 // sdot v8.4s, v30.16b, v14.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "mls v10.4s, v23.4s, v24.4s\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e9b97d2 // sdot v18.4s, v30.16b, v27.16b\n"
+ ".inst 0x4e8e97da // sdot v26.4s, v30.16b, v14.16b\n"
+ "mov v16.16b, v21.16b\n .inst 0x4e8e95f0 // sdot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x4e8095f5 // sdot v21.4s, v15.16b, v0.16b\n"
+ "ldr q29, [x27, x13]\n"
+ ".inst 0x4e8495e1 // sdot v1.4s, v15.16b, v4.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v13.4s\n"
+ "mls v8.4s, v19.4s, v24.4s\n"
+ "mls v26.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v10.16b, v22.16b\n"
+ "mls v18.4s, v21.4s, v24.4s\n"
+ "movi v5.4s, #0x0\n"
+ "sqrdmulh v8.4s, v8.4s, v13.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v13.4s\n"
+ ".inst 0x4e8695e1 // sdot v1.4s, v15.16b, v6.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v18.4s, v18.4s, v13.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "and v17.16b, v8.16b, v22.16b\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v22.16b\n"
+ "and v16.16b, v18.16b, v22.16b\n"
+ "mov v19.16b, v1.16b\n .inst 0x4e9995f3 // sdot v19.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e8995e1 // sdot v1.4s, v15.16b, v9.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
"ldr q27, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q24, [%x[params], #0xf0]\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "srshl v6.4s, v6.4s, v1.4s\n"
- "srshl v25.4s, v25.4s, v1.4s\n"
- "srshl v20.4s, v20.4s, v1.4s\n"
- "ldr q23, [%x[params], #0x130]\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "add v6.4s, v6.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
+ "ldr q17, [%x[params], #0x100]\n"
+ "add v10.4s, v10.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "srshl v8.4s, v8.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "smax v10.4s, v10.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "ldr q31, [%x[params], #0x130]\n"
+ "add v8.4s, v8.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "smin v10.4s, v10.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v8.4s, v8.4s, v7.4s\n"
+ "smax v18.4s, v18.4s, v7.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v8.4s, v8.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"smin v26.4s, v26.4s, v11.4s\n"
- "smax v6.4s, v6.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v6.4s, v6.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s10, [x11, x12]\n"
+ "ldr q0, [%x[params], #0xe0]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "movi v0.4s, #0x0\n"
- ".inst 0x4e8a9580 // sdot v0.4s, v12.16b, v10.16b\n"
- ".inst 0x4e859580 // sdot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s26, [x25, x27]\n"
- "ldr q28, [%x[params], #0xe0]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v22.16b, v0.16b\n .inst 0x4e899596 // sdot v22.4s, v12.16b, v9.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s6, [x24, x27]\n"
- ".inst 0x4e889580 // sdot v0.4s, v12.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x23, x27]\n"
- "mov v29.16b, v28.16b\n"
- "str s20, [x22, x27]\n"
- "mov v25.16b, v28.16b\n"
- "mov v7.16b, v28.16b\n"
- ".inst 0x4e88971c // sdot v28.4s, v24.16b, v8.16b\n"
- ".inst 0x4e8a9719 // sdot v25.4s, v24.16b, v10.16b\n"
- ".inst 0x4e8a97dc // sdot v28.4s, v30.16b, v10.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e88971d // sdot v29.4s, v24.16b, v8.16b\n"
- ".inst 0x4e8a9707 // sdot v7.4s, v24.16b, v10.16b\n"
- ".inst 0x4e8a9591 // sdot v17.4s, v12.16b, v10.16b\n"
- ".inst 0x4e8597d9 // sdot v25.4s, v30.16b, v5.16b\n"
- ".inst 0x4e85977c // sdot v28.4s, v27.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x4e8a97dd // sdot v29.4s, v30.16b, v10.16b\n"
- "ldr q10, [x21, x28]\n"
- ".inst 0x4e8597c7 // sdot v7.4s, v30.16b, v5.16b\n"
- "mls v28.4s, v0.4s, v16.4s\n"
- ".inst 0x4e859591 // sdot v17.4s, v12.16b, v5.16b\n"
- ".inst 0x4e899779 // sdot v25.4s, v27.16b, v9.16b\n"
+ "str s18, [x10, x12]\n"
+ "mov v22.16b, v0.16b\n"
+ "str s8, [x9, x12]\n"
+ "mov v23.16b, v0.16b\n"
+ "str s26, [x28, x12]\n"
+ "mov v14.16b, v0.16b\n"
+ ".inst 0x4e899600 // sdot v0.4s, v16.16b, v9.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e849617 // sdot v23.4s, v16.16b, v4.16b\n"
"ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e85977d // sdot v29.4s, v27.16b, v5.16b\n"
- ".inst 0x4e899767 // sdot v7.4s, v27.16b, v9.16b\n"
- "sqrdmulh v28.4s, v28.4s, v15.4s\n"
- "mov v18.16b, v17.16b\n .inst 0x4e899592 // sdot v18.4s, v12.16b, v9.16b\n"
- ".inst 0x4e889591 // sdot v17.4s, v12.16b, v8.16b\n"
- "ldr q8, [x12, x28]\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mls v25.4s, v22.4s, v16.4s\n"
- "mls v7.4s, v18.4s, v16.4s\n"
- "and v17.16b, v28.16b, v23.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v29.4s, v29.4s, v15.4s\n"
- "sqrdmulh v25.4s, v25.4s, v15.4s\n"
- "sqrdmulh v7.4s, v7.4s, v15.4s\n"
- "ldr q15, [x15, x28]\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "ldp x21, x20, [%x[inptrs], #0x40]\n"
- "ldr q22, [x21, x28]\n"
- "ldr q3, [x20, x28]\n"
- "and v24.16b, v29.16b, v23.16b\n"
- "and v20.16b, v25.16b, v23.16b\n"
- "and v17.16b, v7.16b, v23.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
+ ".inst 0x4e849620 // sdot v0.4s, v17.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ ".inst 0x4e899616 // sdot v22.4s, v16.16b, v9.16b\n"
+ ".inst 0x4e84960e // sdot v14.4s, v16.16b, v4.16b\n"
+ ".inst 0x4e8495e5 // sdot v5.4s, v15.16b, v4.16b\n"
+ ".inst 0x4e869637 // sdot v23.4s, v17.16b, v6.16b\n"
+ ".inst 0x4e869760 // sdot v0.4s, v27.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e849636 // sdot v22.4s, v17.16b, v4.16b\n"
+ "ldr q4, [x22, x13]\n"
+ ".inst 0x4e86962e // sdot v14.4s, v17.16b, v6.16b\n"
+ ".inst 0x4e8695e5 // sdot v5.4s, v15.16b, v6.16b\n"
+ ".inst 0x4e999777 // sdot v23.4s, v27.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "mls v0.4s, v1.4s, v24.4s\n"
+ ".inst 0x4e869776 // sdot v22.4s, v27.16b, v6.16b\n"
+ ".inst 0x4e99976e // sdot v14.4s, v27.16b, v25.16b\n"
+ "mov v17.16b, v5.16b\n .inst 0x4e9995f1 // sdot v17.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e8995e5 // sdot v5.4s, v15.16b, v9.16b\n"
+ "ldr q9, [x26, x13]\n"
+ "sqrdmulh v0.4s, v0.4s, v30.4s\n"
+ "mls v23.4s, v19.4s, v24.4s\n"
+ "and v16.16b, v0.16b, v31.16b\n"
+ "mls v22.4s, v5.4s, v24.4s\n"
+ "mls v14.4s, v17.4s, v24.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v30.4s\n"
+ "ldr q13, [x15, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
"ldp x21, x20, [%x[inptrs], #0x50]\n"
- "ldr q2, [x21, x28]\n"
- "ldr q5, [x20, x28]\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v28.4s, v28.4s, v23.4s\n"
- "sqadd v29.4s, v29.4s, v24.4s\n"
- "ldr q6, [%x[params], #0x160]\n"
- "sqadd v25.4s, v25.4s, v20.4s\n"
- "ldr q20, [%x[params], #0x170]\n"
- "sqadd v7.4s, v7.4s, v17.4s\n"
- "ldr q1, [%x[params], #0x150]\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "srshl v29.4s, v29.4s, v23.4s\n"
- "srshl v25.4s, v25.4s, v23.4s\n"
- "srshl v7.4s, v7.4s, v23.4s\n"
- "ldr q26, [x10, x28]\n"
- "ldp x21, x20, [%x[inptrs], #0x60]\n"
- "ldr q27, [x21, x28]\n"
- "ldr q30, [x20, x28]\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v7.4s, v7.4s, v14.4s\n"
+ "sqadd v0.4s, v0.4s, v16.4s\n"
+ "and v19.16b, v23.16b, v31.16b\n"
+ "ldr q10, [x23, x13]\n"
+ "ldr q26, [x22, x13]\n"
+ "and v21.16b, v22.16b, v31.16b\n"
+ "and v16.16b, v14.16b, v31.16b\n"
+ "ldr q20, [x21, x13]\n"
+ "ldr q6, [x20, x13]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "srshl v0.4s, v0.4s, v31.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v19.4s\n"
+ "ldr q17, [%x[params], #0x170]\n"
+ "add v0.4s, v0.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "ldr q8, [%x[params], #0x160]\n"
+ "sqadd v14.4s, v14.4s, v16.4s\n"
+ "ldr q30, [%x[params], #0x150]\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
+ "smax v0.4s, v0.4s, v7.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "srshl v14.4s, v14.4s, v31.4s\n"
+ "ldr q1, [x25, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "ldr q23, [x21, x28]\n"
- "ldr q9, [x20, x28]\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v7.4s, v7.4s, v13.4s\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "smin v7.4s, v7.4s, v11.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s28, [x25, x27]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "zip2 v17.16b, v15.16b, v21.16b\n"
- "zip1 v15.16b, v15.16b, v21.16b\n"
- "zip1 v18.16b, v31.16b, v8.16b\n"
- "zip2 v8.16b, v31.16b, v8.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s29, [x24, x27]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str s25, [x23, x27]\n"
- "zip2 v25.16b, v15.16b, v18.16b\n"
- "str s7, [x22, x27]\n"
- "zip1 v15.16b, v15.16b, v18.16b\n"
- "zip1 v7.16b, v17.16b, v8.16b\n"
- "add x27, x27, #0x4\n"
- "zip2 v8.16b, v17.16b, v8.16b\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "ldp x27, x26, [%x[inptrs], #0x10]\n"
+ "ldr q5, [x23, x13]\n"
+ "ldr q27, [x22, x13]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v14.4s, v14.4s, v12.4s\n"
+ "ldp x25, x24, [%x[inptrs], #0x20]\n"
+ "ldr q16, [x21, x13]\n"
+ "ldr q25, [x20, x13]\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "ldp x23, x22, [%x[inptrs], #0x30]\n"
+ "smax v22.4s, v22.4s, v7.4s\n"
+ "smax v14.4s, v14.4s, v7.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v14.4s, v14.4s, v11.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s0, [x11, x12]\n"
+ "zip2 v18.16b, v13.16b, v29.16b\n"
+ "zip1 v13.16b, v13.16b, v29.16b\n"
+ "zip1 v0.16b, v2.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "zip2 v9.16b, v2.16b, v9.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "zip2 v2.16b, v13.16b, v0.16b\n"
+ "zip1 v13.16b, v13.16b, v0.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "str s23, [x9, x12]\n"
+ "zip1 v0.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
"ldr q31, [%x[params], #0x140]\n"
- "zip2 v29.16b, v26.16b, v19.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip1 v26.16b, v26.16b, v19.16b\n"
- "zip1 v28.16b, v4.16b, v10.16b\n"
- "zip2 v10.16b, v4.16b, v10.16b\n"
- "zip2 v24.16b, v22.16b, v2.16b\n"
- "zip1 v22.16b, v22.16b, v2.16b\n"
- "zip1 v21.16b, v3.16b, v5.16b\n"
- "zip2 v5.16b, v3.16b, v5.16b\n"
- "zip2 v18.16b, v27.16b, v23.16b\n"
- "zip1 v27.16b, v27.16b, v23.16b\n"
- "zip1 v17.16b, v30.16b, v9.16b\n"
- "zip2 v9.16b, v30.16b, v9.16b\n"
- "zip2 v23.16b, v26.16b, v28.16b\n"
- "zip1 v26.16b, v26.16b, v28.16b\n"
- "zip1 v3.16b, v29.16b, v10.16b\n"
- "zip2 v10.16b, v29.16b, v10.16b\n"
- "zip2 v19.16b, v22.16b, v21.16b\n"
- "zip1 v22.16b, v22.16b, v21.16b\n"
- "zip1 v0.16b, v24.16b, v5.16b\n"
- "zip2 v5.16b, v24.16b, v5.16b\n"
- "zip2 v24.16b, v27.16b, v17.16b\n"
- "zip1 v27.16b, v27.16b, v17.16b\n"
- "zip1 v2.16b, v18.16b, v9.16b\n"
- "zip2 v9.16b, v18.16b, v9.16b\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "mov v28.16b, v31.16b\n"
+ "zip2 v23.16b, v10.16b, v20.16b\n"
+ "zip1 v10.16b, v10.16b, v20.16b\n"
+ "str s22, [x10, x12]\n"
+ "str s14, [x28, x12]\n"
+ "zip2 v22.16b, v1.16b, v28.16b\n"
+ "zip1 v1.16b, v1.16b, v28.16b\n"
+ "add x12, x12, #0x4\n"
+ "zip1 v20.16b, v3.16b, v4.16b\n"
+ "zip2 v4.16b, v3.16b, v4.16b\n"
+ "zip1 v14.16b, v26.16b, v6.16b\n"
+ "zip2 v6.16b, v26.16b, v6.16b\n"
+ "zip2 v19.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "zip1 v16.16b, v27.16b, v25.16b\n"
+ "zip2 v25.16b, v27.16b, v25.16b\n"
+ "zip2 v21.16b, v1.16b, v20.16b\n"
+ "zip1 v1.16b, v1.16b, v20.16b\n"
+ "zip1 v28.16b, v22.16b, v4.16b\n"
+ "zip2 v4.16b, v22.16b, v4.16b\n"
+ "zip2 v29.16b, v10.16b, v14.16b\n"
+ "zip1 v10.16b, v10.16b, v14.16b\n"
+ "zip1 v27.16b, v23.16b, v6.16b\n"
+ "zip2 v6.16b, v23.16b, v6.16b\n"
+ "zip2 v18.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "zip1 v14.16b, v19.16b, v25.16b\n"
+ "zip2 v25.16b, v19.16b, v25.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- "movi v21.4s, #0x0\n"
- ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
- ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e8d97df // sdot v31.4s, v30.16b, v13.16b\n"
+ ".inst 0x4e8197c3 // sdot v3.4s, v30.16b, v1.16b\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
- ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
- "movi v18.4s, #0x0\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
- ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
- ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
- ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
- ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
- ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
- "mls v31.4s, v21.4s, v16.4s\n"
- ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
- ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
- "ldr q4, [%x[params], #0x10]\n"
- ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
- ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
- "ldr q17, [%x[params], #0x0]\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
- ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
- "mls v30.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v21.4s, v16.4s\n"
- "and v27.16b, v31.16b, v4.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v17.4s\n"
- "sqrdmulh v29.4s, v29.4s, v17.4s\n"
- "sqrdmulh v28.4s, v28.4s, v17.4s\n"
- "ldr q15, [%x[params], #0x60]\n"
- "sqadd v31.4s, v31.4s, v27.4s\n"
- "and v20.16b, v30.16b, v4.16b\n"
- "and v18.16b, v29.16b, v4.16b\n"
- "and v17.16b, v28.16b, v4.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
+ "movi v20.4s, #0x0\n"
+ "add x13, x13, #0x10\n"
+ ".inst 0x4e8195f3 // sdot v19.4s, v15.16b, v1.16b\n"
+ ".inst 0x4e81951f // sdot v31.4s, v8.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8a9503 // sdot v3.4s, v8.16b, v10.16b\n"
+ ".inst 0x4e8a95f3 // sdot v19.4s, v15.16b, v10.16b\n"
+ ".inst 0x4e8195f4 // sdot v20.4s, v15.16b, v1.16b\n"
+ ".inst 0x4e8a963f // sdot v31.4s, v17.16b, v10.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e8197d7 // sdot v23.4s, v30.16b, v1.16b\n"
+ "mov v16.16b, v19.16b\n .inst 0x4e8595f0 // sdot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x4e8d95f3 // sdot v19.4s, v15.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e859623 // sdot v3.4s, v17.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a95f4 // sdot v20.4s, v15.16b, v10.16b\n"
+ ".inst 0x4e8d97da // sdot v26.4s, v30.16b, v13.16b\n"
+ ".inst 0x4e8a9517 // sdot v23.4s, v8.16b, v10.16b\n"
+ "mls v31.4s, v19.4s, v24.4s\n"
+ "movi v30.4s, #0x0\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x4e81951a // sdot v26.4s, v8.16b, v1.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "mov v16.16b, v20.16b\n .inst 0x4e8595f0 // sdot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x4e8d95f4 // sdot v20.4s, v15.16b, v13.16b\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ ".inst 0x4e9595fe // sdot v30.4s, v15.16b, v21.16b\n"
+ ".inst 0x4e859637 // sdot v23.4s, v17.16b, v5.16b\n"
+ ".inst 0x4e8a963a // sdot v26.4s, v17.16b, v10.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v8.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ ".inst 0x4e9d95fe // sdot v30.4s, v15.16b, v29.16b\n"
+ "mls v26.4s, v20.4s, v24.4s\n"
+ "movi v5.4s, #0x0\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "and v22.16b, v3.16b, v1.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "ldr q20, [%x[params], #0x60]\n"
+ "mov v19.16b, v30.16b\n .inst 0x4e9295f3 // sdot v19.4s, v15.16b, v18.16b\n"
+ ".inst 0x4e8295fe // sdot v30.4s, v15.16b, v2.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v1.16b\n"
+ "and v16.16b, v26.16b, v1.16b\n"
+ "sqadd v3.4s, v3.4s, v22.4s\n"
+ "ldr q8, [%x[params], #0x50]\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "ldr q27, [%x[params], #0x40]\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "ldr q26, [%x[params], #0x50]\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "ldr q6, [%x[params], #0x30]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v29.4s, v29.4s, v4.4s\n"
- "srshl v28.4s, v28.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x70]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v1.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x40]\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v1.4s, #0x0\n"
- ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s31, [x25, x27]\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s31, [x11, x12]\n"
"ldr q31, [%x[params], #0x20]\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s30, [x24, x27]\n"
- "mov v22.16b, v1.16b\n .inst 0x4e989596 // sdot v22.4s, v12.16b, v24.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s29, [x23, x27]\n"
- "mov v29.16b, v31.16b\n"
- ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
- "str s28, [x22, x27]\n"
- "mov v21.16b, v31.16b\n"
- "mov v20.16b, v31.16b\n"
- ".inst 0x4e9994df // sdot v31.4s, v6.16b, v25.16b\n"
- ".inst 0x4e9794d5 // sdot v21.4s, v6.16b, v23.16b\n"
- ".inst 0x4e97977f // sdot v31.4s, v27.16b, v23.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e9994dd // sdot v29.4s, v6.16b, v25.16b\n"
- ".inst 0x4e9794d4 // sdot v20.4s, v6.16b, v23.16b\n"
- ".inst 0x4e979592 // sdot v18.4s, v12.16b, v23.16b\n"
- ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
- ".inst 0x4e93975f // sdot v31.4s, v26.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
- ".inst 0x4e939774 // sdot v20.4s, v27.16b, v19.16b\n"
- "mls v31.4s, v1.4s, v16.4s\n"
- ".inst 0x4e939592 // sdot v18.4s, v12.16b, v19.16b\n"
- ".inst 0x4e989755 // sdot v21.4s, v26.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
- ".inst 0x4e989754 // sdot v20.4s, v26.16b, v24.16b\n"
- "sqrdmulh v31.4s, v31.4s, v15.4s\n"
- "mov v17.16b, v18.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
- ".inst 0x4e999592 // sdot v18.4s, v12.16b, v25.16b\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mls v21.4s, v22.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v4.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v29.4s, v29.4s, v15.4s\n"
- "sqrdmulh v21.4s, v21.4s, v15.4s\n"
- "sqrdmulh v20.4s, v20.4s, v15.4s\n"
- "ldr q27, [%x[params], #0xc0]\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v29.16b, v4.16b\n"
- "and v18.16b, v21.16b, v4.16b\n"
- "and v17.16b, v20.16b, v4.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s3, [x9, x12]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "mov v10.16b, v31.16b\n"
+ "str s26, [x10, x12]\n"
+ "mov v1.16b, v31.16b\n"
+ "str s23, [x28, x12]\n"
+ "mov v26.16b, v31.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e95962a // sdot v10.4s, v17.16b, v21.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e829621 // sdot v1.4s, v17.16b, v2.16b\n"
+ ".inst 0x4e95963a // sdot v26.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e9595e5 // sdot v5.4s, v15.16b, v21.16b\n"
+ ".inst 0x4e9d960a // sdot v10.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9d951f // sdot v31.4s, v8.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e959601 // sdot v1.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9d95e5 // sdot v5.4s, v15.16b, v29.16b\n"
+ ".inst 0x4e92950a // sdot v10.4s, v8.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "mls v31.4s, v30.4s, v24.4s\n"
+ "movi v3.4s, #0x0\n"
+ ".inst 0x4e9d9501 // sdot v1.4s, v8.16b, v29.16b\n"
+ ".inst 0x4e92951a // sdot v26.4s, v8.16b, v18.16b\n"
+ "mov v16.16b, v5.16b\n .inst 0x4e9295f0 // sdot v16.4s, v15.16b, v18.16b\n"
+ ".inst 0x4e8295e5 // sdot v5.4s, v15.16b, v2.16b\n"
+ ".inst 0x4e9c95e3 // sdot v3.4s, v15.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mls v10.4s, v19.4s, v24.4s\n"
+ "mls v26.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "mls v1.4s, v5.4s, v24.4s\n"
+ "movi v2.4s, #0x0\n"
+ "sqrdmulh v10.4s, v10.4s, v20.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ ".inst 0x4e9b95e3 // sdot v3.4s, v15.16b, v27.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "and v17.16b, v10.16b, v22.16b\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v22.16b\n"
+ "and v16.16b, v1.16b, v22.16b\n"
+ "mov v19.16b, v3.16b\n .inst 0x4e8e95f3 // sdot v19.4s, v15.16b, v14.16b\n"
+ ".inst 0x4e8095e3 // sdot v3.4s, v15.16b, v0.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqadd v29.4s, v29.4s, v19.4s\n"
- "ldr q26, [%x[params], #0xa0]\n"
- "sqadd v21.4s, v21.4s, v18.4s\n"
- "ldr q25, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q24, [%x[params], #0x90]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v29.4s, v29.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "ldr q18, [%x[params], #0xb0]\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "ldr q17, [%x[params], #0xa0]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v22.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "add v10.4s, v10.4s, v12.4s\n"
+ "add v1.4s, v1.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v10.4s, v10.4s, v7.4s\n"
+ "smax v1.4s, v1.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v23.4s, #0x0\n"
- ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
- ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v10.4s, v10.4s, v11.4s\n"
+ "smin v1.4s, v1.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s31, [x25, x27]\n"
- "ldr q31, [%x[params], #0x80]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v22.16b, v23.16b\n .inst 0x4e829596 // sdot v22.4s, v12.16b, v2.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s29, [x24, x27]\n"
- ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s21, [x23, x27]\n"
- "mov v21.16b, v31.16b\n"
- "str s20, [x22, x27]\n"
- "mov v4.16b, v31.16b\n"
- "mov v20.16b, v31.16b\n"
- ".inst 0x4e87971f // sdot v31.4s, v24.16b, v7.16b\n"
- ".inst 0x4e839704 // sdot v4.4s, v24.16b, v3.16b\n"
- ".inst 0x4e83975f // sdot v31.4s, v26.16b, v3.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e879715 // sdot v21.4s, v24.16b, v7.16b\n"
- ".inst 0x4e839714 // sdot v20.4s, v24.16b, v3.16b\n"
- ".inst 0x4e839592 // sdot v18.4s, v12.16b, v3.16b\n"
- ".inst 0x4e809744 // sdot v4.4s, v26.16b, v0.16b\n"
- ".inst 0x4e80973f // sdot v31.4s, v25.16b, v0.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "str s31, [x11, x12]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s1, [x10, x12]\n"
+ "mov v30.16b, v21.16b\n"
+ "str s10, [x9, x12]\n"
+ "mov v20.16b, v21.16b\n"
+ "str s26, [x28, x12]\n"
+ "mov v29.16b, v21.16b\n"
+ ".inst 0x4e809615 // sdot v21.4s, v16.16b, v0.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
"ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e839755 // sdot v21.4s, v26.16b, v3.16b\n"
- ".inst 0x4e809754 // sdot v20.4s, v26.16b, v0.16b\n"
- "mls v31.4s, v23.4s, v16.4s\n"
- ".inst 0x4e809592 // sdot v18.4s, v12.16b, v0.16b\n"
- ".inst 0x4e829724 // sdot v4.4s, v25.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e809735 // sdot v21.4s, v25.16b, v0.16b\n"
- ".inst 0x4e829734 // sdot v20.4s, v25.16b, v2.16b\n"
- "sqrdmulh v31.4s, v31.4s, v27.4s\n"
- "mov v17.16b, v18.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
- ".inst 0x4e879592 // sdot v18.4s, v12.16b, v7.16b\n"
- "mls v21.4s, v18.4s, v16.4s\n"
- "mls v4.4s, v22.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v1.16b\n"
+ ".inst 0x4e9c9635 // sdot v21.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e80961e // sdot v30.4s, v16.16b, v0.16b\n"
+ ".inst 0x4e9c961d // sdot v29.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c95e2 // sdot v2.4s, v15.16b, v28.16b\n"
+ ".inst 0x4e9b9634 // sdot v20.4s, v17.16b, v27.16b\n"
+ ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9c963e // sdot v30.4s, v17.16b, v28.16b\n"
+ ".inst 0x4e9b963d // sdot v29.4s, v17.16b, v27.16b\n"
+ ".inst 0x4e9b95e2 // sdot v2.4s, v15.16b, v27.16b\n"
+ ".inst 0x4e8e9654 // sdot v20.4s, v18.16b, v14.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "mls v21.4s, v3.4s, v24.4s\n"
+ "movi v5.4s, #0x0\n"
+ ".inst 0x4e9b965e // sdot v30.4s, v18.16b, v27.16b\n"
+ ".inst 0x4e8e965d // sdot v29.4s, v18.16b, v14.16b\n"
+ "mov v16.16b, v2.16b\n .inst 0x4e8e95f0 // sdot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x4e8095e2 // sdot v2.4s, v15.16b, v0.16b\n"
+ ".inst 0x4e8495e5 // sdot v5.4s, v15.16b, v4.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "mls v20.4s, v19.4s, v24.4s\n"
+ "mls v29.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "mls v30.4s, v2.4s, v24.4s\n"
+ "movi v27.4s, #0x0\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v23.4s\n"
+ ".inst 0x4e8695e5 // sdot v5.4s, v15.16b, v6.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "ldr q26, [%x[params], #0x120]\n"
+ "and v17.16b, v20.16b, v22.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "and v19.16b, v29.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "mov v14.16b, v5.16b\n .inst 0x4e9995ee // sdot v14.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e8995e5 // sdot v5.4s, v15.16b, v9.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v27.4s\n"
- "sqrdmulh v4.4s, v4.4s, v27.4s\n"
- "sqrdmulh v20.4s, v20.4s, v27.4s\n"
- "ldr q30, [%x[params], #0x120]\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v21.16b, v1.16b\n"
- "and v18.16b, v4.16b, v1.16b\n"
- "and v17.16b, v20.16b, v1.16b\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "sqadd v21.4s, v21.4s, v19.4s\n"
- "ldr q29, [%x[params], #0x100]\n"
- "sqadd v4.4s, v4.4s, v18.4s\n"
- "ldr q28, [%x[params], #0x110]\n"
"sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q27, [%x[params], #0xf0]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v4.4s, v4.4s, v1.4s\n"
- "srshl v20.4s, v20.4s, v1.4s\n"
- "ldr q26, [%x[params], #0x130]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v4.4s, v4.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v4.4s, v4.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
+ "ldr q18, [%x[params], #0x110]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q17, [%x[params], #0x100]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "smax v21.4s, v21.4s, v7.4s\n"
+ "srshl v29.4s, v29.4s, v22.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
"smin v21.4s, v21.4s, v11.4s\n"
- "smin v4.4s, v4.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v7.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smax v29.4s, v29.4s, v7.4s\n"
"smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v25.4s, #0x0\n"
- ".inst 0x4e8a9599 // sdot v25.4s, v12.16b, v10.16b\n"
- ".inst 0x4e859599 // sdot v25.4s, v12.16b, v5.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s31, [x25, x27]\n"
- "ldr q24, [%x[params], #0xe0]\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v23.16b, v25.16b\n .inst 0x4e899597 // sdot v23.4s, v12.16b, v9.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x11, x12]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s21, [x24, x27]\n"
- ".inst 0x4e889599 // sdot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s4, [x23, x27]\n"
- "mov v22.16b, v24.16b\n"
- "str s20, [x22, x27]\n"
- "mov v21.16b, v24.16b\n"
- "mov v20.16b, v24.16b\n"
- ".inst 0x4e889778 // sdot v24.4s, v27.16b, v8.16b\n"
- ".inst 0x4e8a9775 // sdot v21.4s, v27.16b, v10.16b\n"
- ".inst 0x4e8a97b8 // sdot v24.4s, v29.16b, v10.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4e889776 // sdot v22.4s, v27.16b, v8.16b\n"
- ".inst 0x4e8a9774 // sdot v20.4s, v27.16b, v10.16b\n"
- ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
- ".inst 0x4e8597b5 // sdot v21.4s, v29.16b, v5.16b\n"
- ".inst 0x4e859798 // sdot v24.4s, v28.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x4e8a97b6 // sdot v22.4s, v29.16b, v10.16b\n"
- ".inst 0x4e8597b4 // sdot v20.4s, v29.16b, v5.16b\n"
- "mls v24.4s, v25.4s, v16.4s\n"
- ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
- ".inst 0x4e899795 // sdot v21.4s, v28.16b, v9.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s20, [x9, x12]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s30, [x10, x12]\n"
+ "mov v20.16b, v22.16b\n"
+ "str s29, [x28, x12]\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x4e899616 // sdot v22.4s, v16.16b, v9.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e849615 // sdot v21.4s, v16.16b, v4.16b\n"
"ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e859796 // sdot v22.4s, v28.16b, v5.16b\n"
- ".inst 0x4e899794 // sdot v20.4s, v28.16b, v9.16b\n"
- "sqrdmulh v24.4s, v24.4s, v30.4s\n"
- "mov v17.16b, v18.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
- ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
- "mls v22.4s, v18.4s, v16.4s\n"
- "mls v21.4s, v23.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v24.16b, v26.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqrdmulh v21.4s, v21.4s, v30.4s\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "and v19.16b, v22.16b, v26.16b\n"
- "and v18.16b, v21.16b, v26.16b\n"
- "and v17.16b, v20.16b, v26.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ ".inst 0x4e849636 // sdot v22.4s, v17.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ ".inst 0x4e899614 // sdot v20.4s, v16.16b, v9.16b\n"
+ ".inst 0x4e849613 // sdot v19.4s, v16.16b, v4.16b\n"
+ ".inst 0x4e8495fb // sdot v27.4s, v15.16b, v4.16b\n"
+ ".inst 0x4e869635 // sdot v21.4s, v17.16b, v6.16b\n"
+ ".inst 0x4e869656 // sdot v22.4s, v18.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e849634 // sdot v20.4s, v17.16b, v4.16b\n"
+ ".inst 0x4e869633 // sdot v19.4s, v17.16b, v6.16b\n"
+ ".inst 0x4e8695fb // sdot v27.4s, v15.16b, v6.16b\n"
+ ".inst 0x4e999655 // sdot v21.4s, v18.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "mls v22.4s, v5.4s, v24.4s\n"
+ ".inst 0x4e869654 // sdot v20.4s, v18.16b, v6.16b\n"
+ ".inst 0x4e999653 // sdot v19.4s, v18.16b, v25.16b\n"
+ "mov v17.16b, v27.16b\n .inst 0x4e9995f1 // sdot v17.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e8995fb // sdot v27.4s, v15.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v26.4s\n"
+ "mls v21.4s, v14.4s, v24.4s\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "mls v20.4s, v27.4s, v24.4s\n"
+ "mls v19.4s, v17.4s, v24.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v26.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v21.4s, v21.4s, v18.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
"sqadd v20.4s, v20.4s, v17.4s\n"
- "srshl v24.4s, v24.4s, v26.4s\n"
- "srshl v22.4s, v22.4s, v26.4s\n"
- "srshl v21.4s, v21.4s, v26.4s\n"
- "srshl v20.4s, v20.4s, v26.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v7.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v21.4s, v21.4s, v12.4s\n"
"smin v22.4s, v22.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "smax v21.4s, v21.4s, v7.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smax v20.4s, v20.4s, v7.4s\n"
+ "smax v19.4s, v19.4s, v7.4s\n"
"smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x11, x12]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x25, x27]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s22, [x24, x27]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s21, [x23, x27]\n"
- "str s20, [x22, x27]\n"
- "add x27, x27, #0x4\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x10, x12]\n"
+ "str s21, [x9, x12]\n"
+ "str s19, [x28, x12]\n"
+ "add x12, x12, #0x4\n"
"beq 35f\n"
"3:" // Oddments
"and x20, %x[n_channels], #0xf\n"
- "add x15, x15, x28\n"
- "add x14, x14, x28\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
- "add x10, x10, x28\n"
- "add x9, x9, x28\n"
- "add x26, x26, x28\n"
- "add x21, x21, x28\n"
+ "add x15, x15, x13\n"
+ "add x14, x14, x13\n"
+ "add x27, x27, x13\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d15, [x15], #0x8\n"
- "ldr d25, [x14], #0x8\n"
- "ldr d7, [x13], #0x8\n"
- "ldr d8, [x12], #0x8\n"
- "ldr d26, [x10], #0x8\n"
- "ldr d23, [x9], #0x8\n"
- "ldr d3, [x26], #0x8\n"
- "ldr d10, [x21], #0x8\n"
+ "ldr d13, [x15], #0x8\n"
+ "ldr d2, [x14], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v15.s }[2], [x15], #0x4\n"
- "ld1 { v25.s }[2], [x14], #0x4\n"
- "ld1 { v7.s }[2], [x13], #0x4\n"
- "ld1 { v8.s }[2], [x12], #0x4\n"
- "ld1 { v26.s }[2], [x10], #0x4\n"
- "ld1 { v23.s }[2], [x9], #0x4\n"
- "ld1 { v3.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x15], #0x4\n"
+ "ld1 { v2.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v28.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v15.h }[6], [x15], #0x2\n"
- "ld1 { v25.h }[6], [x14], #0x2\n"
- "ld1 { v7.h }[6], [x13], #0x2\n"
- "ld1 { v8.h }[6], [x12], #0x2\n"
- "ld1 { v26.h }[6], [x10], #0x2\n"
- "ld1 { v23.h }[6], [x9], #0x2\n"
- "ld1 { v3.h }[6], [x26], #0x2\n"
- "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x15], #0x2\n"
+ "ld1 { v2.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x27], #0x2\n"
+ "ld1 { v9.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v28.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[14], [x15], #0x1\n"
- "ld1 { v25.b }[14], [x14], #0x1\n"
- "ld1 { v7.b }[14], [x13], #0x1\n"
- "ld1 { v8.b }[14], [x12], #0x1\n"
- "ld1 { v26.b }[14], [x10], #0x1\n"
- "ld1 { v23.b }[14], [x9], #0x1\n"
- "ld1 { v3.b }[14], [x26], #0x1\n"
- "ld1 { v10.b }[14], [x21], #0x1\n"
+ "ld1 { v13.b }[14], [x15], #0x1\n"
+ "ld1 { v2.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x27], #0x1\n"
+ "ld1 { v9.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x25], #0x1\n"
+ "ld1 { v21.b }[14], [x24], #0x1\n"
+ "ld1 { v28.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x22], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[12], [x15], #0x1\n"
- "ld1 { v25.b }[12], [x14], #0x1\n"
- "ld1 { v7.b }[12], [x13], #0x1\n"
- "ld1 { v8.b }[12], [x12], #0x1\n"
- "ld1 { v26.b }[12], [x10], #0x1\n"
- "ld1 { v23.b }[12], [x9], #0x1\n"
- "ld1 { v3.b }[12], [x26], #0x1\n"
- "ld1 { v10.b }[12], [x21], #0x1\n"
+ "ld1 { v13.b }[12], [x15], #0x1\n"
+ "ld1 { v2.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x27], #0x1\n"
+ "ld1 { v9.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x25], #0x1\n"
+ "ld1 { v21.b }[12], [x24], #0x1\n"
+ "ld1 { v28.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x22], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v15.h }[4], [x15], #0x2\n"
- "ld1 { v25.h }[4], [x14], #0x2\n"
- "ld1 { v7.h }[4], [x13], #0x2\n"
- "ld1 { v8.h }[4], [x12], #0x2\n"
- "ld1 { v26.h }[4], [x10], #0x2\n"
- "ld1 { v23.h }[4], [x9], #0x2\n"
- "ld1 { v3.h }[4], [x26], #0x2\n"
- "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x15], #0x2\n"
+ "ld1 { v2.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x27], #0x2\n"
+ "ld1 { v9.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v28.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[10], [x15], #0x1\n"
- "ld1 { v25.b }[10], [x14], #0x1\n"
- "ld1 { v7.b }[10], [x13], #0x1\n"
- "ld1 { v8.b }[10], [x12], #0x1\n"
- "ld1 { v26.b }[10], [x10], #0x1\n"
- "ld1 { v23.b }[10], [x9], #0x1\n"
- "ld1 { v3.b }[10], [x26], #0x1\n"
- "ld1 { v10.b }[10], [x21], #0x1\n"
+ "ld1 { v13.b }[10], [x15], #0x1\n"
+ "ld1 { v2.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x27], #0x1\n"
+ "ld1 { v9.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x25], #0x1\n"
+ "ld1 { v21.b }[10], [x24], #0x1\n"
+ "ld1 { v28.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x22], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[8], [x15], #0x1\n"
- "ld1 { v25.b }[8], [x14], #0x1\n"
- "ld1 { v7.b }[8], [x13], #0x1\n"
- "ld1 { v8.b }[8], [x12], #0x1\n"
- "ld1 { v26.b }[8], [x10], #0x1\n"
- "ld1 { v23.b }[8], [x9], #0x1\n"
- "ld1 { v3.b }[8], [x26], #0x1\n"
- "ld1 { v10.b }[8], [x21], #0x1\n"
+ "ld1 { v13.b }[8], [x15], #0x1\n"
+ "ld1 { v2.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x27], #0x1\n"
+ "ld1 { v9.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x25], #0x1\n"
+ "ld1 { v21.b }[8], [x24], #0x1\n"
+ "ld1 { v28.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x22], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s15, [x15], #0x4\n"
- "ldr s25, [x14], #0x4\n"
- "ldr s7, [x13], #0x4\n"
- "ldr s8, [x12], #0x4\n"
- "ldr s26, [x10], #0x4\n"
- "ldr s23, [x9], #0x4\n"
- "ldr s3, [x26], #0x4\n"
- "ldr s10, [x21], #0x4\n"
+ "ldr s13, [x15], #0x4\n"
+ "ldr s2, [x14], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s9, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s28, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v15.h }[2], [x15], #0x2\n"
- "ld1 { v25.h }[2], [x14], #0x2\n"
- "ld1 { v7.h }[2], [x13], #0x2\n"
- "ld1 { v8.h }[2], [x12], #0x2\n"
- "ld1 { v26.h }[2], [x10], #0x2\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "ld1 { v3.h }[2], [x26], #0x2\n"
- "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x27], #0x2\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[6], [x15], #0x1\n"
- "ld1 { v25.b }[6], [x14], #0x1\n"
- "ld1 { v7.b }[6], [x13], #0x1\n"
- "ld1 { v8.b }[6], [x12], #0x1\n"
- "ld1 { v26.b }[6], [x10], #0x1\n"
- "ld1 { v23.b }[6], [x9], #0x1\n"
- "ld1 { v3.b }[6], [x26], #0x1\n"
- "ld1 { v10.b }[6], [x21], #0x1\n"
+ "ld1 { v13.b }[6], [x15], #0x1\n"
+ "ld1 { v2.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x27], #0x1\n"
+ "ld1 { v9.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x25], #0x1\n"
+ "ld1 { v21.b }[6], [x24], #0x1\n"
+ "ld1 { v28.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x22], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[4], [x15], #0x1\n"
- "ld1 { v25.b }[4], [x14], #0x1\n"
- "ld1 { v7.b }[4], [x13], #0x1\n"
- "ld1 { v8.b }[4], [x12], #0x1\n"
- "ld1 { v26.b }[4], [x10], #0x1\n"
- "ld1 { v23.b }[4], [x9], #0x1\n"
- "ld1 { v3.b }[4], [x26], #0x1\n"
- "ld1 { v10.b }[4], [x21], #0x1\n"
+ "ld1 { v13.b }[4], [x15], #0x1\n"
+ "ld1 { v2.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x27], #0x1\n"
+ "ld1 { v9.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x25], #0x1\n"
+ "ld1 { v21.b }[4], [x24], #0x1\n"
+ "ld1 { v28.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x22], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h15, [x15], #0x2\n"
- "ldr h25, [x14], #0x2\n"
- "ldr h7, [x13], #0x2\n"
- "ldr h8, [x12], #0x2\n"
- "ldr h26, [x10], #0x2\n"
- "ldr h23, [x9], #0x2\n"
- "ldr h3, [x26], #0x2\n"
- "ldr h10, [x21], #0x2\n"
+ "ldr h13, [x15], #0x2\n"
+ "ldr h2, [x14], #0x2\n"
+ "ldr h0, [x27], #0x2\n"
+ "ldr h9, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h28, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[2], [x15], #0x1\n"
- "ld1 { v25.b }[2], [x14], #0x1\n"
- "ld1 { v7.b }[2], [x13], #0x1\n"
- "ld1 { v8.b }[2], [x12], #0x1\n"
- "ld1 { v26.b }[2], [x10], #0x1\n"
- "ld1 { v23.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x26], #0x1\n"
- "ld1 { v10.b }[2], [x21], #0x1\n"
+ "ld1 { v13.b }[2], [x15], #0x1\n"
+ "ld1 { v2.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x27], #0x1\n"
+ "ld1 { v9.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x25], #0x1\n"
+ "ld1 { v21.b }[2], [x24], #0x1\n"
+ "ld1 { v28.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x22], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b15, [x15], #0x1\n"
- "ldr b25, [x14], #0x1\n"
- "ldr b7, [x13], #0x1\n"
- "ldr b8, [x12], #0x1\n"
- "ldr b26, [x10], #0x1\n"
- "ldr b23, [x9], #0x1\n"
- "ldr b3, [x26], #0x1\n"
- "ldr b10, [x21], #0x1\n"
+ "ldr b13, [x15], #0x1\n"
+ "ldr b2, [x14], #0x1\n"
+ "ldr b0, [x27], #0x1\n"
+ "ldr b9, [x26], #0x1\n"
+ "ldr b1, [x25], #0x1\n"
+ "ldr b21, [x24], #0x1\n"
+ "ldr b28, [x23], #0x1\n"
+ "ldr b4, [x22], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "add x15, x15, x28\n"
- "add x14, x14, x28\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldp x26, x21, [%x[inptrs], #0x70]\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
- "add x10, x10, x28\n"
- "add x9, x9, x28\n"
- "add x26, x26, x28\n"
- "add x21, x21, x28\n"
+ "ldp x27, x26, [%x[inptrs], #0x50]\n"
+ "ldp x25, x24, [%x[inptrs], #0x60]\n"
+ "ldp x23, x22, [%x[inptrs], #0x70]\n"
+ "add x15, x15, x13\n"
+ "add x14, x14, x13\n"
+ "add x27, x27, x13\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d22, [x15], #0x8\n"
- "ldr d19, [x14], #0x8\n"
- "ldr d0, [x13], #0x8\n"
- "ldr d5, [x12], #0x8\n"
- "ldr d27, [x10], #0x8\n"
- "ldr d24, [x9], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d9, [x21], #0x8\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d29, [x14], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ldr d6, [x26], #0x8\n"
+ "ldr d5, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v22.s }[2], [x15], #0x4\n"
- "ld1 { v19.s }[2], [x14], #0x4\n"
- "ld1 { v0.s }[2], [x13], #0x4\n"
- "ld1 { v5.s }[2], [x12], #0x4\n"
- "ld1 { v27.s }[2], [x10], #0x4\n"
- "ld1 { v24.s }[2], [x9], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v9.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "ld1 { v29.s }[2], [x14], #0x4\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
+ "ld1 { v6.s }[2], [x26], #0x4\n"
+ "ld1 { v5.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v14.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v22.h }[6], [x15], #0x2\n"
- "ld1 { v19.h }[6], [x14], #0x2\n"
- "ld1 { v0.h }[6], [x13], #0x2\n"
- "ld1 { v5.h }[6], [x12], #0x2\n"
- "ld1 { v27.h }[6], [x10], #0x2\n"
- "ld1 { v24.h }[6], [x9], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v9.h }[6], [x21], #0x2\n"
+ "ld1 { v10.h }[6], [x15], #0x2\n"
+ "ld1 { v29.h }[6], [x14], #0x2\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n"
+ "ld1 { v6.h }[6], [x26], #0x2\n"
+ "ld1 { v5.h }[6], [x25], #0x2\n"
+ "ld1 { v18.h }[6], [x24], #0x2\n"
+ "ld1 { v14.h }[6], [x23], #0x2\n"
+ "ld1 { v25.h }[6], [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[14], [x15], #0x1\n"
- "ld1 { v19.b }[14], [x14], #0x1\n"
- "ld1 { v0.b }[14], [x13], #0x1\n"
- "ld1 { v5.b }[14], [x12], #0x1\n"
- "ld1 { v27.b }[14], [x10], #0x1\n"
- "ld1 { v24.b }[14], [x9], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v9.b }[14], [x21], #0x1\n"
+ "ld1 { v10.b }[14], [x15], #0x1\n"
+ "ld1 { v29.b }[14], [x14], #0x1\n"
+ "ld1 { v27.b }[14], [x27], #0x1\n"
+ "ld1 { v6.b }[14], [x26], #0x1\n"
+ "ld1 { v5.b }[14], [x25], #0x1\n"
+ "ld1 { v18.b }[14], [x24], #0x1\n"
+ "ld1 { v14.b }[14], [x23], #0x1\n"
+ "ld1 { v25.b }[14], [x22], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[12], [x15], #0x1\n"
- "ld1 { v19.b }[12], [x14], #0x1\n"
- "ld1 { v0.b }[12], [x13], #0x1\n"
- "ld1 { v5.b }[12], [x12], #0x1\n"
- "ld1 { v27.b }[12], [x10], #0x1\n"
- "ld1 { v24.b }[12], [x9], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v9.b }[12], [x21], #0x1\n"
+ "ld1 { v10.b }[12], [x15], #0x1\n"
+ "ld1 { v29.b }[12], [x14], #0x1\n"
+ "ld1 { v27.b }[12], [x27], #0x1\n"
+ "ld1 { v6.b }[12], [x26], #0x1\n"
+ "ld1 { v5.b }[12], [x25], #0x1\n"
+ "ld1 { v18.b }[12], [x24], #0x1\n"
+ "ld1 { v14.b }[12], [x23], #0x1\n"
+ "ld1 { v25.b }[12], [x22], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v22.h }[4], [x15], #0x2\n"
- "ld1 { v19.h }[4], [x14], #0x2\n"
- "ld1 { v0.h }[4], [x13], #0x2\n"
- "ld1 { v5.h }[4], [x12], #0x2\n"
- "ld1 { v27.h }[4], [x10], #0x2\n"
- "ld1 { v24.h }[4], [x9], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v9.h }[4], [x21], #0x2\n"
+ "ld1 { v10.h }[4], [x15], #0x2\n"
+ "ld1 { v29.h }[4], [x14], #0x2\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n"
+ "ld1 { v6.h }[4], [x26], #0x2\n"
+ "ld1 { v5.h }[4], [x25], #0x2\n"
+ "ld1 { v18.h }[4], [x24], #0x2\n"
+ "ld1 { v14.h }[4], [x23], #0x2\n"
+ "ld1 { v25.h }[4], [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[10], [x15], #0x1\n"
- "ld1 { v19.b }[10], [x14], #0x1\n"
- "ld1 { v0.b }[10], [x13], #0x1\n"
- "ld1 { v5.b }[10], [x12], #0x1\n"
- "ld1 { v27.b }[10], [x10], #0x1\n"
- "ld1 { v24.b }[10], [x9], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v9.b }[10], [x21], #0x1\n"
+ "ld1 { v10.b }[10], [x15], #0x1\n"
+ "ld1 { v29.b }[10], [x14], #0x1\n"
+ "ld1 { v27.b }[10], [x27], #0x1\n"
+ "ld1 { v6.b }[10], [x26], #0x1\n"
+ "ld1 { v5.b }[10], [x25], #0x1\n"
+ "ld1 { v18.b }[10], [x24], #0x1\n"
+ "ld1 { v14.b }[10], [x23], #0x1\n"
+ "ld1 { v25.b }[10], [x22], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[8], [x15], #0x1\n"
- "ld1 { v19.b }[8], [x14], #0x1\n"
- "ld1 { v0.b }[8], [x13], #0x1\n"
- "ld1 { v5.b }[8], [x12], #0x1\n"
- "ld1 { v27.b }[8], [x10], #0x1\n"
- "ld1 { v24.b }[8], [x9], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v9.b }[8], [x21], #0x1\n"
+ "ld1 { v10.b }[8], [x15], #0x1\n"
+ "ld1 { v29.b }[8], [x14], #0x1\n"
+ "ld1 { v27.b }[8], [x27], #0x1\n"
+ "ld1 { v6.b }[8], [x26], #0x1\n"
+ "ld1 { v5.b }[8], [x25], #0x1\n"
+ "ld1 { v18.b }[8], [x24], #0x1\n"
+ "ld1 { v14.b }[8], [x23], #0x1\n"
+ "ld1 { v25.b }[8], [x22], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s22, [x15], #0x4\n"
- "ldr s19, [x14], #0x4\n"
- "ldr s0, [x13], #0x4\n"
- "ldr s5, [x12], #0x4\n"
- "ldr s27, [x10], #0x4\n"
- "ldr s24, [x9], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s9, [x21], #0x4\n"
+ "ldr s10, [x15], #0x4\n"
+ "ldr s29, [x14], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "ldr s6, [x26], #0x4\n"
+ "ldr s5, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s14, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v22.h }[2], [x15], #0x2\n"
- "ld1 { v19.h }[2], [x14], #0x2\n"
- "ld1 { v0.h }[2], [x13], #0x2\n"
- "ld1 { v5.h }[2], [x12], #0x2\n"
- "ld1 { v27.h }[2], [x10], #0x2\n"
- "ld1 { v24.h }[2], [x9], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x15], #0x2\n"
+ "ld1 { v29.h }[2], [x14], #0x2\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
+ "ld1 { v6.h }[2], [x26], #0x2\n"
+ "ld1 { v5.h }[2], [x25], #0x2\n"
+ "ld1 { v18.h }[2], [x24], #0x2\n"
+ "ld1 { v14.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[6], [x15], #0x1\n"
- "ld1 { v19.b }[6], [x14], #0x1\n"
- "ld1 { v0.b }[6], [x13], #0x1\n"
- "ld1 { v5.b }[6], [x12], #0x1\n"
- "ld1 { v27.b }[6], [x10], #0x1\n"
- "ld1 { v24.b }[6], [x9], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v9.b }[6], [x21], #0x1\n"
+ "ld1 { v10.b }[6], [x15], #0x1\n"
+ "ld1 { v29.b }[6], [x14], #0x1\n"
+ "ld1 { v27.b }[6], [x27], #0x1\n"
+ "ld1 { v6.b }[6], [x26], #0x1\n"
+ "ld1 { v5.b }[6], [x25], #0x1\n"
+ "ld1 { v18.b }[6], [x24], #0x1\n"
+ "ld1 { v14.b }[6], [x23], #0x1\n"
+ "ld1 { v25.b }[6], [x22], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[4], [x15], #0x1\n"
- "ld1 { v19.b }[4], [x14], #0x1\n"
- "ld1 { v0.b }[4], [x13], #0x1\n"
- "ld1 { v5.b }[4], [x12], #0x1\n"
- "ld1 { v27.b }[4], [x10], #0x1\n"
- "ld1 { v24.b }[4], [x9], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v9.b }[4], [x21], #0x1\n"
+ "ld1 { v10.b }[4], [x15], #0x1\n"
+ "ld1 { v29.b }[4], [x14], #0x1\n"
+ "ld1 { v27.b }[4], [x27], #0x1\n"
+ "ld1 { v6.b }[4], [x26], #0x1\n"
+ "ld1 { v5.b }[4], [x25], #0x1\n"
+ "ld1 { v18.b }[4], [x24], #0x1\n"
+ "ld1 { v14.b }[4], [x23], #0x1\n"
+ "ld1 { v25.b }[4], [x22], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h22, [x15], #0x2\n"
- "ldr h19, [x14], #0x2\n"
- "ldr h0, [x13], #0x2\n"
- "ldr h5, [x12], #0x2\n"
- "ldr h27, [x10], #0x2\n"
- "ldr h24, [x9], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h9, [x21], #0x2\n"
+ "ldr h10, [x15], #0x2\n"
+ "ldr h29, [x14], #0x2\n"
+ "ldr h27, [x27], #0x2\n"
+ "ldr h6, [x26], #0x2\n"
+ "ldr h5, [x25], #0x2\n"
+ "ldr h18, [x24], #0x2\n"
+ "ldr h14, [x23], #0x2\n"
+ "ldr h25, [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[2], [x15], #0x1\n"
- "ld1 { v19.b }[2], [x14], #0x1\n"
- "ld1 { v0.b }[2], [x13], #0x1\n"
- "ld1 { v5.b }[2], [x12], #0x1\n"
- "ld1 { v27.b }[2], [x10], #0x1\n"
- "ld1 { v24.b }[2], [x9], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v9.b }[2], [x21], #0x1\n"
+ "ld1 { v10.b }[2], [x15], #0x1\n"
+ "ld1 { v29.b }[2], [x14], #0x1\n"
+ "ld1 { v27.b }[2], [x27], #0x1\n"
+ "ld1 { v6.b }[2], [x26], #0x1\n"
+ "ld1 { v5.b }[2], [x25], #0x1\n"
+ "ld1 { v18.b }[2], [x24], #0x1\n"
+ "ld1 { v14.b }[2], [x23], #0x1\n"
+ "ld1 { v25.b }[2], [x22], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b22, [x15], #0x1\n"
- "ldr b19, [x14], #0x1\n"
- "ldr b0, [x13], #0x1\n"
- "ldr b5, [x12], #0x1\n"
- "ldr b27, [x10], #0x1\n"
- "ldr b24, [x9], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b9, [x21], #0x1\n"
+ "ldr b10, [x15], #0x1\n"
+ "ldr b29, [x14], #0x1\n"
+ "ldr b27, [x27], #0x1\n"
+ "ldr b6, [x26], #0x1\n"
+ "ldr b5, [x25], #0x1\n"
+ "ldr b18, [x24], #0x1\n"
+ "ldr b14, [x23], #0x1\n"
+ "ldr b25, [x22], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
"ldr q20, [%x[params], #0x10]\n"
- "ldr q6, [%x[params], #0x20]\n"
- "zip2 v1.16b, v26.16b, v3.16b\n"
- "zip1 v26.16b, v26.16b, v3.16b\n"
- "ldr q4, [%x[params], #0x30]\n"
- "zip1 v18.16b, v23.16b, v10.16b\n"
- "zip2 v30.16b, v15.16b, v7.16b\n"
+ "ldr q17, [%x[params], #0x20]\n"
+ "zip2 v26.16b, v1.16b, v28.16b\n"
+ "zip1 v1.16b, v1.16b, v28.16b\n"
+ "ldr q30, [%x[params], #0x30]\n"
+ "zip1 v19.16b, v21.16b, v4.16b\n"
+ "zip2 v23.16b, v13.16b, v0.16b\n"
"cmp x20, #0x4\n"
- "zip1 v15.16b, v15.16b, v7.16b\n"
- "zip1 v29.16b, v25.16b, v8.16b\n"
- "zip2 v8.16b, v25.16b, v8.16b\n"
- "zip2 v10.16b, v23.16b, v10.16b\n"
- "zip2 v23.16b, v26.16b, v18.16b\n"
- "zip1 v26.16b, v26.16b, v18.16b\n"
- "zip2 v28.16b, v22.16b, v0.16b\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "zip1 v21.16b, v19.16b, v5.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4e9a9591 // sdot v17.4s, v12.16b, v26.16b\n"
- "zip2 v25.16b, v15.16b, v29.16b\n"
- "zip1 v15.16b, v15.16b, v29.16b\n"
- "zip1 v7.16b, v30.16b, v8.16b\n"
- "zip2 v8.16b, v30.16b, v8.16b\n"
+ "zip1 v13.16b, v13.16b, v0.16b\n"
+ "zip1 v22.16b, v2.16b, v9.16b\n"
+ "zip2 v9.16b, v2.16b, v9.16b\n"
+ "zip2 v4.16b, v21.16b, v4.16b\n"
+ "zip2 v21.16b, v1.16b, v19.16b\n"
+ "zip1 v1.16b, v1.16b, v19.16b\n"
+ "zip2 v16.16b, v10.16b, v27.16b\n"
+ "zip1 v10.16b, v10.16b, v27.16b\n"
+ "zip1 v19.16b, v29.16b, v6.16b\n"
+ "movi v8.4s, #0x0\n"
+ "zip2 v2.16b, v13.16b, v22.16b\n"
+ "zip1 v13.16b, v13.16b, v22.16b\n"
+ "zip1 v0.16b, v23.16b, v9.16b\n"
+ "zip2 v9.16b, v23.16b, v9.16b\n"
"ldr q31, [%x[params], #0x0]\n"
- "zip2 v5.16b, v19.16b, v5.16b\n"
- "zip2 v30.16b, v27.16b, v2.16b\n"
- "zip1 v27.16b, v27.16b, v2.16b\n"
- "zip1 v18.16b, v24.16b, v9.16b\n"
- "zip2 v9.16b, v24.16b, v9.16b\n"
- "zip2 v19.16b, v22.16b, v21.16b\n"
- "zip1 v22.16b, v22.16b, v21.16b\n"
- "zip1 v3.16b, v1.16b, v10.16b\n"
- ".inst 0x4e969591 // sdot v17.4s, v12.16b, v22.16b\n"
- "zip2 v10.16b, v1.16b, v10.16b\n"
- "zip1 v0.16b, v28.16b, v5.16b\n"
- "zip2 v5.16b, v28.16b, v5.16b\n"
- "zip2 v24.16b, v27.16b, v18.16b\n"
- "zip1 v27.16b, v27.16b, v18.16b\n"
- "zip1 v2.16b, v30.16b, v9.16b\n"
- "mov v18.16b, v17.16b\n .inst 0x4e9b9592 // sdot v18.4s, v12.16b, v27.16b\n"
- "zip2 v9.16b, v30.16b, v9.16b\n"
- "mov v30.16b, v31.16b\n"
- ".inst 0x4e8f9591 // sdot v17.4s, v12.16b, v15.16b\n"
- "mov v29.16b, v31.16b\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x4e8f969f // sdot v31.4s, v20.16b, v15.16b\n"
- ".inst 0x4e9a969d // sdot v29.4s, v20.16b, v26.16b\n"
- ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "movi v1.4s, #0x0\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
- ".inst 0x4e9a9581 // sdot v1.4s, v12.16b, v26.16b\n"
- ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
- ".inst 0x4e96949f // sdot v31.4s, v4.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x4e8f969e // sdot v30.4s, v20.16b, v15.16b\n"
- ".inst 0x4e9a969c // sdot v28.4s, v20.16b, v26.16b\n"
- "mls v31.4s, v17.4s, v16.4s\n"
- ".inst 0x4e969581 // sdot v1.4s, v12.16b, v22.16b\n"
- ".inst 0x4e9b949d // sdot v29.4s, v4.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
- ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mov v20.16b, v1.16b\n .inst 0x4e9b9594 // sdot v20.4s, v12.16b, v27.16b\n"
- ".inst 0x4e8f9581 // sdot v1.4s, v12.16b, v15.16b\n"
- "ldr q18, [%x[params], #0x40]\n"
- "sqrdmulh v31.4s, v31.4s, v18.4s\n"
- ".inst 0x4e96949e // sdot v30.4s, v4.16b, v22.16b\n"
- ".inst 0x4e9b949c // sdot v28.4s, v4.16b, v27.16b\n"
- "mls v30.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e8195e8 // sdot v8.4s, v15.16b, v1.16b\n"
+ "zip2 v6.16b, v29.16b, v6.16b\n"
+ "zip2 v22.16b, v5.16b, v14.16b\n"
+ "zip1 v5.16b, v5.16b, v14.16b\n"
+ "zip1 v3.16b, v18.16b, v25.16b\n"
+ "zip2 v25.16b, v18.16b, v25.16b\n"
+ "zip2 v29.16b, v10.16b, v19.16b\n"
+ "zip1 v10.16b, v10.16b, v19.16b\n"
+ "zip1 v28.16b, v26.16b, v4.16b\n"
+ "zip2 v4.16b, v26.16b, v4.16b\n"
+ "zip1 v27.16b, v16.16b, v6.16b\n"
+ "zip2 v6.16b, v16.16b, v6.16b\n"
+ "zip2 v18.16b, v5.16b, v3.16b\n"
+ "zip1 v5.16b, v5.16b, v3.16b\n"
+ "zip1 v14.16b, v22.16b, v25.16b\n"
+ ".inst 0x4e8a95e8 // sdot v8.4s, v15.16b, v10.16b\n"
+ "zip2 v25.16b, v22.16b, v25.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x4e8d969f // sdot v31.4s, v20.16b, v13.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e819683 // sdot v3.4s, v20.16b, v1.16b\n"
+ "mov v16.16b, v8.16b\n .inst 0x4e8595f0 // sdot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x4e8d95e8 // sdot v8.4s, v15.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e81963f // sdot v31.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e8a9623 // sdot v3.4s, v17.16b, v10.16b\n"
+ ".inst 0x4e8d969a // sdot v26.4s, v20.16b, v13.16b\n"
+ ".inst 0x4e8195f6 // sdot v22.4s, v15.16b, v1.16b\n"
+ ".inst 0x4e8a97df // sdot v31.4s, v30.16b, v10.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e819697 // sdot v23.4s, v20.16b, v1.16b\n"
+ ".inst 0x4e8597c3 // sdot v3.4s, v30.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e81963a // sdot v26.4s, v17.16b, v1.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e8a95f6 // sdot v22.4s, v15.16b, v10.16b\n"
+ "mls v31.4s, v8.4s, v24.4s\n"
+ ".inst 0x4e8a9637 // sdot v23.4s, v17.16b, v10.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ "mov v19.16b, v22.16b\n .inst 0x4e8595f3 // sdot v19.4s, v15.16b, v5.16b\n"
+ ".inst 0x4e8d95f6 // sdot v22.4s, v15.16b, v13.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
"add %x[params], %x[params], #0x60\n"
- "mls v28.4s, v20.4s, v16.4s\n"
- "and v17.16b, v31.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v18.4s\n"
- "sqrdmulh v29.4s, v29.4s, v18.4s\n"
- "sqrdmulh v28.4s, v28.4s, v18.4s\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v17.16b, v30.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
- "and v26.16b, v28.16b, v21.16b\n"
+ ".inst 0x4e8a97da // sdot v26.4s, v30.16b, v10.16b\n"
+ ".inst 0x4e8597d7 // sdot v23.4s, v30.16b, v5.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v26.4s, v22.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "mls v23.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "and v19.16b, v3.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v16.16b, v23.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v28.4s, v28.4s, v26.4s\n"
- "srshl v31.4s, v31.4s, v21.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "srshl v29.4s, v29.4s, v21.4s\n"
- "srshl v28.4s, v28.4s, v21.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v20.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"blt 20f\n"
- "str s31, [x25, x27]\n"
- "str s30, [x24, x27]\n"
- "str s29, [x23, x27]\n"
- "str s28, [x22, x27]\n"
+ "str s31, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s3, [x9, x12]\n"
+ "str s23, [x28, x12]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 21f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
- "add x27, x27, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
"ldr q31, [%x[params], #0x0]\n"
- "ldr q27, [%x[params], #0x10]\n"
- "movi v1.4s, #0x0\n"
- ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
- "ldr q26, [%x[params], #0x20]\n"
- "ldr q22, [%x[params], #0x30]\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "ldr q4, [%x[params], #0x40]\n"
- "ldr q21, [%x[params], #0x50]\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x4e99977f // sdot v31.4s, v27.16b, v25.16b\n"
- ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
- ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
- "movi v20.4s, #0x0\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q20, [%x[params], #0x30]\n"
"cmp x20, #0x4\n"
- ".inst 0x4e97975f // sdot v31.4s, v26.16b, v23.16b\n"
- "mov v18.16b, v1.16b\n .inst 0x4e989592 // sdot v18.4s, v12.16b, v24.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ ".inst 0x4e9595e8 // sdot v8.4s, v15.16b, v21.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x4e99977e // sdot v30.4s, v27.16b, v25.16b\n"
- ".inst 0x4e97977c // sdot v28.4s, v27.16b, v23.16b\n"
- ".inst 0x4e979594 // sdot v20.4s, v12.16b, v23.16b\n"
- ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
- ".inst 0x4e9396df // sdot v31.4s, v22.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4e97975e // sdot v30.4s, v26.16b, v23.16b\n"
- ".inst 0x4e93975c // sdot v28.4s, v26.16b, v19.16b\n"
- "mls v31.4s, v1.4s, v16.4s\n"
- ".inst 0x4e939594 // sdot v20.4s, v12.16b, v19.16b\n"
- ".inst 0x4e9896dd // sdot v29.4s, v22.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x4e9396de // sdot v30.4s, v22.16b, v19.16b\n"
- ".inst 0x4e9896dc // sdot v28.4s, v22.16b, v24.16b\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "mov v17.16b, v20.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
- ".inst 0x4e999594 // sdot v20.4s, v12.16b, v25.16b\n"
- "mls v30.4s, v20.4s, v16.4s\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v30.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
- "and v17.16b, v28.16b, v21.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x4e8294bf // sdot v31.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e9594a3 // sdot v3.4s, v5.16b, v21.16b\n"
+ ".inst 0x4e9d95e8 // sdot v8.4s, v15.16b, v29.16b\n"
+ ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e9594b7 // sdot v23.4s, v5.16b, v21.16b\n"
+ ".inst 0x4e9595fe // sdot v30.4s, v15.16b, v21.16b\n"
+ ".inst 0x4e9d96c3 // sdot v3.4s, v22.16b, v29.16b\n"
+ "mov v16.16b, v8.16b\n .inst 0x4e9295f0 // sdot v16.4s, v15.16b, v18.16b\n"
+ ".inst 0x4e8295e8 // sdot v8.4s, v15.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e9d969f // sdot v31.4s, v20.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e8294ba // sdot v26.4s, v5.16b, v2.16b\n"
+ ".inst 0x4e929683 // sdot v3.4s, v20.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ ".inst 0x4e9d96d7 // sdot v23.4s, v22.16b, v29.16b\n"
+ ".inst 0x4e9d95fe // sdot v30.4s, v15.16b, v29.16b\n"
+ "mls v31.4s, v8.4s, v24.4s\n"
+ ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x4e929697 // sdot v23.4s, v20.16b, v18.16b\n"
+ "mov v16.16b, v30.16b\n .inst 0x4e9295f0 // sdot v16.4s, v15.16b, v18.16b\n"
+ ".inst 0x4e8295fe // sdot v30.4s, v15.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9d969a // sdot v26.4s, v20.16b, v29.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "mls v26.4s, v30.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v18.16b, v3.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v18.4s\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v19.4s\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "srshl v31.4s, v31.4s, v21.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "srshl v29.4s, v29.4s, v21.4s\n"
- "srshl v28.4s, v28.4s, v21.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"blt 24f\n"
- "str s31, [x25, x27]\n"
- "str s30, [x24, x27]\n"
- "str s29, [x23, x27]\n"
- "str s28, [x22, x27]\n"
+ "str s31, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s3, [x9, x12]\n"
+ "str s23, [x28, x12]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 25f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
- "add x27, x27, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
"ldr q31, [%x[params], #0x0]\n"
- "ldr q25, [%x[params], #0x10]\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x4e839598 // sdot v24.4s, v12.16b, v3.16b\n"
- "ldr q23, [%x[params], #0x20]\n"
- "ldr q22, [%x[params], #0x30]\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "ldr q21, [%x[params], #0x40]\n"
- "ldr q20, [%x[params], #0x50]\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x4e87973f // sdot v31.4s, v25.16b, v7.16b\n"
- ".inst 0x4e809598 // sdot v24.4s, v12.16b, v0.16b\n"
- ".inst 0x4e83973d // sdot v29.4s, v25.16b, v3.16b\n"
- "movi v19.4s, #0x0\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ "ldr q18, [%x[params], #0x30]\n"
"cmp x20, #0x4\n"
- ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
- "mov v18.16b, v24.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ ".inst 0x4e9c95f6 // sdot v22.4s, v15.16b, v28.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e879598 // sdot v24.4s, v12.16b, v7.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e87973e // sdot v30.4s, v25.16b, v7.16b\n"
- ".inst 0x4e83973c // sdot v28.4s, v25.16b, v3.16b\n"
- ".inst 0x4e839593 // sdot v19.4s, v12.16b, v3.16b\n"
- ".inst 0x4e8096fd // sdot v29.4s, v23.16b, v0.16b\n"
- ".inst 0x4e8096df // sdot v31.4s, v22.16b, v0.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x4e8097bf // sdot v31.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e9c97a3 // sdot v3.4s, v29.16b, v28.16b\n"
+ ".inst 0x4e9b95f6 // sdot v22.4s, v15.16b, v27.16b\n"
+ ".inst 0x4e9c969f // sdot v31.4s, v20.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e9c97b7 // sdot v23.4s, v29.16b, v28.16b\n"
+ ".inst 0x4e9c95f5 // sdot v21.4s, v15.16b, v28.16b\n"
+ ".inst 0x4e9b9683 // sdot v3.4s, v20.16b, v27.16b\n"
+ "mov v16.16b, v22.16b\n .inst 0x4e8e95f0 // sdot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x4e8095f6 // sdot v22.4s, v15.16b, v0.16b\n"
"ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e8396fe // sdot v30.4s, v23.16b, v3.16b\n"
- ".inst 0x4e8096fc // sdot v28.4s, v23.16b, v0.16b\n"
- "mls v31.4s, v24.4s, v16.4s\n"
- ".inst 0x4e809593 // sdot v19.4s, v12.16b, v0.16b\n"
- ".inst 0x4e8296dd // sdot v29.4s, v22.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x4e8096de // sdot v30.4s, v22.16b, v0.16b\n"
- ".inst 0x4e8296dc // sdot v28.4s, v22.16b, v2.16b\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
- "mov v17.16b, v19.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
- ".inst 0x4e879593 // sdot v19.4s, v12.16b, v7.16b\n"
- "mls v30.4s, v19.4s, v16.4s\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v20.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v30.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v28.16b, v20.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ ".inst 0x4e9b965f // sdot v31.4s, v18.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8097ba // sdot v26.4s, v29.16b, v0.16b\n"
+ ".inst 0x4e8e9643 // sdot v3.4s, v18.16b, v14.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ ".inst 0x4e9b9697 // sdot v23.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e9b95f5 // sdot v21.4s, v15.16b, v27.16b\n"
+ "mls v31.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e9c969a // sdot v26.4s, v20.16b, v28.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x4e8e9657 // sdot v23.4s, v18.16b, v14.16b\n"
+ "mov v16.16b, v21.16b\n .inst 0x4e8e95f0 // sdot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x4e8095f5 // sdot v21.4s, v15.16b, v0.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9b965a // sdot v26.4s, v18.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "mls v26.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v18.16b, v3.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v18.4s\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v19.4s\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"blt 28f\n"
- "str s31, [x25, x27]\n"
- "str s30, [x24, x27]\n"
- "str s29, [x23, x27]\n"
- "str s28, [x22, x27]\n"
+ "str s31, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s3, [x9, x12]\n"
+ "str s23, [x28, x12]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 29f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
- "add x27, x27, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
"ldr q31, [%x[params], #0x0]\n"
- "ldr q23, [%x[params], #0x10]\n"
+ "ldr q1, [%x[params], #0x10]\n"
"movi v22.4s, #0x0\n"
- ".inst 0x4e8a9596 // sdot v22.4s, v12.16b, v10.16b\n"
- "ldr q21, [%x[params], #0x20]\n"
- "ldr q19, [%x[params], #0x30]\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "ldr q20, [%x[params], #0x40]\n"
- "ldr q26, [%x[params], #0x50]\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x4e8896ff // sdot v31.4s, v23.16b, v8.16b\n"
- ".inst 0x4e859596 // sdot v22.4s, v12.16b, v5.16b\n"
- ".inst 0x4e8a96fd // sdot v29.4s, v23.16b, v10.16b\n"
- "movi v18.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ ".inst 0x4e8495f6 // sdot v22.4s, v15.16b, v4.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e8a96bf // sdot v31.4s, v21.16b, v10.16b\n"
- "mov v17.16b, v22.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- ".inst 0x4e889596 // sdot v22.4s, v12.16b, v8.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x4e8896fe // sdot v30.4s, v23.16b, v8.16b\n"
- ".inst 0x4e8a96fc // sdot v28.4s, v23.16b, v10.16b\n"
- ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
- ".inst 0x4e8596bd // sdot v29.4s, v21.16b, v5.16b\n"
- ".inst 0x4e85967f // sdot v31.4s, v19.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x4e8a96be // sdot v30.4s, v21.16b, v10.16b\n"
- ".inst 0x4e8596bc // sdot v28.4s, v21.16b, v5.16b\n"
- "mls v31.4s, v22.4s, v16.4s\n"
- ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
- ".inst 0x4e89967d // sdot v29.4s, v19.16b, v9.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x4e89943f // sdot v31.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e849423 // sdot v3.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e8695f6 // sdot v22.4s, v15.16b, v6.16b\n"
+ ".inst 0x4e84969f // sdot v31.4s, v20.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ ".inst 0x4e849437 // sdot v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e8495f5 // sdot v21.4s, v15.16b, v4.16b\n"
+ ".inst 0x4e869683 // sdot v3.4s, v20.16b, v6.16b\n"
+ "mov v16.16b, v22.16b\n .inst 0x4e9995f0 // sdot v16.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e8995f6 // sdot v22.4s, v15.16b, v9.16b\n"
"ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x4e85967e // sdot v30.4s, v19.16b, v5.16b\n"
- ".inst 0x4e89967c // sdot v28.4s, v19.16b, v9.16b\n"
- "sqrdmulh v31.4s, v31.4s, v20.4s\n"
- "mov v7.16b, v18.16b\n .inst 0x4e899587 // sdot v7.4s, v12.16b, v9.16b\n"
- ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
- "mls v30.4s, v18.4s, v16.4s\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mls v28.4s, v7.4s, v16.4s\n"
- "and v16.16b, v31.16b, v26.16b\n"
+ ".inst 0x4e86965f // sdot v31.4s, v18.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x4e89943a // sdot v26.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e999643 // sdot v3.4s, v18.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e869697 // sdot v23.4s, v20.16b, v6.16b\n"
+ ".inst 0x4e8695f5 // sdot v21.4s, v15.16b, v6.16b\n"
+ "mls v31.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e84969a // sdot v26.4s, v20.16b, v4.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x4e999657 // sdot v23.4s, v18.16b, v25.16b\n"
+ "mov v16.16b, v21.16b\n .inst 0x4e9995f0 // sdot v16.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e8995f5 // sdot v21.4s, v15.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e86965a // sdot v26.4s, v18.16b, v6.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "mls v26.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v18.16b, v3.16b, v19.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v20.4s\n"
- "sqrdmulh v29.4s, v29.4s, v20.4s\n"
- "sqrdmulh v28.4s, v28.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v18.16b, v30.16b, v26.16b\n"
- "and v17.16b, v29.16b, v26.16b\n"
- "and v16.16b, v28.16b, v26.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v18.4s\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "srshl v31.4s, v31.4s, v26.4s\n"
- "srshl v30.4s, v30.4s, v26.4s\n"
- "srshl v29.4s, v29.4s, v26.4s\n"
- "srshl v28.4s, v28.4s, v26.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 33f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4626007afa..5db236747a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const int8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const int8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,1072 +91,1072 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v14.16b }, [x20]\n"
- "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v19.16b }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
- "add x21, x23, %[offsetof_Requantize32_minval]\n"
- "add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v29.8h }, [x21]\n"
- "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "ssubl v23.8h, v23.8b, v19.8b\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "ssubl v16.8h, v16.8b, v19.8b\n"
- "ssubl v1.8h, v1.8b, v19.8b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "ssubl v5.8h, v5.8b, v19.8b\n"
- "ssubl v26.8h, v26.8b, v19.8b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "ssubl v18.8h, v18.8b, v19.8b\n"
- "ssubl v31.8h, v31.8b, v19.8b\n"
- "ldr d20, [x14, #0x40]\n"
+ "lsr x11, x8, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v11.16b }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v16.16b }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v15.8h, v15.8b, v16.8b\n"
+ "ssubl v4.8h, v4.8b, v16.8b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v25.8h, v25.8b, v19.8b\n"
- "ssubl v20.8h, v20.8b, v19.8b\n"
- "ldr q9, [x20, #0x0]\n"
- "ldr q24, [x20, #0x10]\n"
+ "ssubl v5.8h, v5.8b, v16.8b\n"
+ "ssubl v3.8h, v3.8b, v16.8b\n"
+ "ssubl v25.8h, v25.8b, v16.8b\n"
+ "ssubl v10.8h, v10.8b, v16.8b\n"
+ "ssubl v6.8h, v6.8b, v16.8b\n"
+ "ssubl v7.8h, v7.8b, v16.8b\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
"add x20, x20, #0x20\n"
+ "ssubl v9.8h, v9.8b, v16.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x23, x22, [x15, #0x0]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
"ldp x21, x20, [x15, #0x10]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d22, [x23, x17]\n"
- "ldr d4, [x22, x17]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d8, [x21, x17]\n"
- "ldr d27, [x20, x17]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d19, [x23, x17]\n"
+ "ldr d21, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ldr d22, [x20, x17]\n"
"ldr x20, [x15, #0x20]\n"
- "ldr d15, [x20, x17]\n"
- "ssubl v22.8h, v22.8b, v14.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ssubl v8.8h, v8.8b, v14.8b\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
- "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
+ "ssubl v29.8h, v29.8b, v11.8b\n"
+ "ssubl v22.8h, v22.8b, v11.8b\n"
+ "ldr d20, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v11.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q3, [x13, #0x0]\n"
- "ldr q17, [x12, #0x0]\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
- "ldr q21, [x13, #0x10]\n"
- "ldr q28, [x12, #0x10]\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "ldr x20, [x15, #0x28]\n"
- "ldr d11, [x20, x17]\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ldr x20, [x15, #0x38]\n"
- "ldr d4, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "ssubl v11.8h, v11.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "ldr x20, [x15, #0x40]\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ldr x27, [x15, #0x48]\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "ssubl v22.8h, v22.8b, v14.8b\n"
- "ldr x26, [x15, #0x50]\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "ldr d8, [x20, x17]\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "ssubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr q17, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "ldr q28, [x13, #0x10]\n"
+ "ldr q23, [x12, #0x10]\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "ldr x24, [x15, #0x28]\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "ldr x23, [x15, #0x38]\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x21, [x15, #0x40]\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
+ "ldr x26, [x15, #0x48]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr d21, [x24, x17]\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "ldr d29, [x21, x17]\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
"ldr x25, [x15, #0x58]\n"
"ldr x24, [x15, #0x60]\n"
- "smlal v2.4s, v11.4h, v31.4h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
"ldr x23, [x15, #0x68]\n"
+ "ssubl v18.8h, v18.8b, v11.8b\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
"ldr x22, [x15, #0x70]\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
- "smlal v9.4s, v4.4h, v16.4h\n"
"ldr x21, [x15, #0x78]\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "ldr d22, [x26, x17]\n"
+ "smlal v0.4s, v21.4h, v6.4h\n"
+ "smlal2 v24.4s, v21.8h, v6.8h\n"
+ "ldr d21, [x20, x17]\n"
+ "ssubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "ldr d27, [x27, x17]\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "ldr d11, [x26, x17]\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
"add x14, x14, #0x48\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal v10.4s, v22.4h, v20.4h\n"
- "ssubl v11.8h, v11.8b, v14.8b\n"
- "subs x8, x8, #0x1\n"
- "smlal2 v24.4s, v4.8h, v16.8h\n"
- "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "ssubl v22.8h, v22.8b, v11.8b\n"
+ "subs x11, x11, #0x1\n"
+ "smlal v31.4s, v19.4h, v9.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
"add x13, x13, #0x20\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "ldr d20, [x25, x17]\n"
"add x12, x12, #0x20\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "ldr d15, [x25, x17]\n"
- "ssubl v15.8h, v15.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v20.8h\n"
- "ldr d22, [x24, x17]\n"
- "smlal v7.4s, v4.4h, v23.4h\n"
- "ssubl v22.8h, v22.8b, v14.8b\n"
- "smlal v2.4s, v27.4h, v18.4h\n"
- "smlal v10.4s, v27.4h, v26.4h\n"
- "smlal2 v24.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v20.4h\n"
- "smlal2 v0.4s, v4.8h, v23.8h\n"
- "ldr d4, [x23, x17]\n"
- "smlal2 v30.4s, v27.8h, v18.8h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v26.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "ssubl v26.8h, v26.8b, v14.8b\n"
- "smlal v2.4s, v11.4h, v23.4h\n"
- "smlal v10.4s, v15.4h, v1.4h\n"
- "smlal2 v24.4s, v27.8h, v20.8h\n"
- "smlal v9.4s, v11.4h, v5.4h\n"
- "smlal2 v0.4s, v8.8h, v16.8h\n"
- "ldr d8, [x21, x17]\n"
- "smlal2 v30.4s, v11.8h, v23.8h\n"
- "ssubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v15.8h, v1.8h\n"
- "smlal v7.4s, v27.4h, v25.4h\n"
+ "smlal v2.4s, v18.4h, v4.4h\n"
+ "smlal2 v1.4s, v18.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v9.8h\n"
+ "ldr d19, [x24, x17]\n"
+ "smlal v8.4s, v18.4h, v15.4h\n"
+ "smlal v31.4s, v22.4h, v25.4h\n"
+ "ssubl v20.8h, v20.8b, v11.8b\n"
+ "smlal2 v30.4s, v18.8h, v15.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v0.4s, v22.4h, v10.4h\n"
+ "smlal2 v24.4s, v22.8h, v10.8h\n"
+ "smlal v2.4s, v29.4h, v5.4h\n"
+ "smlal2 v1.4s, v29.8h, v5.8h\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "smlal2 v27.4s, v22.8h, v25.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v8.4s, v29.4h, v4.4h\n"
+ "ssubl v18.8h, v18.8b, v11.8b\n"
+ "smlal v31.4s, v20.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d4, [x21, x17]\n"
"add x17, x17, #0x8\n"
- "smlal v2.4s, v22.4h, v5.4h\n"
- "smlal v10.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "smlal v9.4s, v22.4h, v31.4h\n"
- "sqrdmulh v9.4s, v9.4s, v3.4s\n"
- "smlal2 v0.4s, v27.8h, v25.8h\n"
- "smlal2 v30.4s, v22.8h, v5.8h\n"
- "and v27.16b, v9.16b, v17.16b\n"
- "smlal2 v6.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v15.4h, v18.4h\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smlal v2.4s, v26.4h, v25.4h\n"
- "smlal v10.4s, v26.4h, v31.4h\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "smlal2 v24.4s, v22.8h, v31.8h\n"
- "smlal2 v0.4s, v15.8h, v18.8h\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- "smlal2 v30.4s, v26.8h, v25.8h\n"
- "smlal2 v6.4s, v26.8h, v31.8h\n"
- "and v31.16b, v24.16b, v28.16b\n"
- "smlal v7.4s, v4.4h, v20.4h\n"
- "smlal v2.4s, v8.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v3.4s\n"
- "smlal v10.4s, v8.4h, v25.4h\n"
- "smlal2 v0.4s, v4.8h, v20.8h\n"
- "sqrdmulh v2.4s, v2.4s, v3.4s\n"
- "smlal2 v30.4s, v8.8h, v20.8h\n"
- "smlal2 v6.4s, v8.8h, v25.8h\n"
- "sqrdmulh v10.4s, v10.4s, v3.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v22.16b, v7.16b, v17.16b\n"
- "sqrdmulh v0.4s, v0.4s, v21.4s\n"
- "and v3.16b, v2.16b, v17.16b\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "and v11.16b, v10.16b, v17.16b\n"
- "sqrdmulh v6.4s, v6.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v20.16b, v0.16b, v28.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v31.16b, v30.16b, v28.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v18.16b, v6.16b, v28.16b\n"
- "sqadd v7.4s, v7.4s, v22.4s\n"
+ "smlal v0.4s, v21.4h, v15.4h\n"
+ "smlal2 v24.4s, v21.8h, v15.8h\n"
+ "smlal v2.4s, v22.4h, v9.4h\n"
+ "smlal2 v1.4s, v22.8h, v9.8h\n"
+ "ssubl v25.8h, v25.8b, v11.8b\n"
+ "smlal2 v27.4s, v20.8h, v5.8h\n"
+ "smlal v8.4s, v22.4h, v7.4h\n"
+ "ssubl v4.8h, v4.8b, v11.8b\n"
+ "smlal v31.4s, v18.4h, v10.4h\n"
+ "smlal2 v30.4s, v22.8h, v7.8h\n"
+ "smlal v0.4s, v19.4h, v3.4h\n"
+ "smlal2 v24.4s, v19.8h, v3.8h\n"
+ "smlal v2.4s, v21.4h, v3.4h\n"
+ "smlal2 v1.4s, v21.8h, v3.8h\n"
+ "smlal2 v27.4s, v18.8h, v10.8h\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal v31.4s, v25.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v10.8h\n"
+ "smlal v0.4s, v25.4h, v7.4h\n"
+ "smlal2 v24.4s, v25.8h, v7.8h\n"
+ "smlal v2.4s, v19.4h, v6.4h\n"
+ "smlal2 v1.4s, v19.8h, v6.8h\n"
+ "smlal2 v27.4s, v25.8h, v6.8h\n"
+ "smlal v8.4s, v18.4h, v9.4h\n"
+ "smlal v31.4s, v4.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v9.8h\n"
+ "smlal v0.4s, v4.4h, v9.4h\n"
+ "smlal2 v24.4s, v4.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v28.4s\n"
+ "smlal2 v27.4s, v4.8h, v7.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "and v18.16b, v2.16b, v26.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v28.4s\n"
+ "and v4.16b, v1.16b, v23.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v21.16b, v8.16b, v26.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v26.16b\n"
+ "sqadd v2.4s, v2.4s, v18.4s\n"
+ "and v19.16b, v31.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v30.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v4.4s\n"
"sshr v20.4s, v20.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v3.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v11.4s\n"
+ "and v17.16b, v24.16b, v23.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v3.16b, v27.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
"sqadd v0.4s, v0.4s, v20.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "sqadd v30.4s, v30.4s, v31.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v28.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v28.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "srshl v0.4s, v0.4s, v26.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v27.4s, v27.4s, v3.4s\n"
+ "srshl v1.4s, v1.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v28.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d9, [x11, x16]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v23.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v23.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v23.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str d7, [x10, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d2, [x9, x16]\n"
- "str d10, [x28, x16]\n"
- "ldr q9, [x20, #0x0]\n"
- "ldr q24, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str d2, [x10, x16]\n"
+ "str d8, [x9, x16]\n"
+ "str d0, [x28, x16]\n"
+ "str d31, [x27, x16]\n"
"add x16, x16, #0x8\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
- "ldr d20, [x14, #0x40]\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldp x23, x22, [x15, #0x0]\n"
- "ssubl v23.8h, v23.8b, v19.8b\n"
- "ssubl v16.8h, v16.8b, v19.8b\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ssubl v15.8h, v15.8b, v16.8b\n"
+ "ssubl v4.8h, v4.8b, v16.8b\n"
+ "ssubl v5.8h, v5.8b, v16.8b\n"
+ "ssubl v3.8h, v3.8b, v16.8b\n"
"ldp x21, x20, [x15, #0x10]\n"
- "ldr d22, [x23, x17]\n"
- "ssubl v1.8h, v1.8b, v19.8b\n"
- "ssubl v5.8h, v5.8b, v19.8b\n"
- "ldr d4, [x22, x17]\n"
- "ldr d8, [x21, x17]\n"
- "ssubl v26.8h, v26.8b, v19.8b\n"
- "ssubl v18.8h, v18.8b, v19.8b\n"
- "ldr d27, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v16.8b\n"
+ "ssubl v10.8h, v10.8b, v16.8b\n"
+ "ssubl v6.8h, v6.8b, v16.8b\n"
+ "ssubl v7.8h, v7.8b, v16.8b\n"
+ "ldr d19, [x23, x17]\n"
+ "ldr d21, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ldr d22, [x20, x17]\n"
+ "ssubl v9.8h, v9.8b, v16.8b\n"
"ldr x20, [x15, #0x20]\n"
- "ssubl v31.8h, v31.8b, v19.8b\n"
- "ssubl v25.8h, v25.8b, v19.8b\n"
- "ldr d15, [x20, x17]\n"
- "ssubl v20.8h, v20.8b, v19.8b\n"
- "ssubl v22.8h, v22.8b, v14.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ssubl v8.8h, v8.8b, v14.8b\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
- "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
+ "ssubl v29.8h, v29.8b, v11.8b\n"
+ "ssubl v22.8h, v22.8b, v11.8b\n"
+ "ldr d20, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v11.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q28, [x13, #0x0]\n"
- "ldr q17, [x12, #0x0]\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
- "ldr q21, [x13, #0x10]\n"
- "ldr q3, [x12, #0x10]\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "ldr x20, [x15, #0x28]\n"
- "ldr d11, [x20, x17]\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ldr x20, [x15, #0x38]\n"
- "ldr d4, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "ssubl v11.8h, v11.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "ldr q17, [x13, #0x10]\n"
+ "ldr q23, [x12, #0x10]\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "ldr x23, [x15, #0x28]\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "ldr x22, [x15, #0x38]\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "ldr x21, [x15, #0x30]\n"
"ldr x20, [x15, #0x40]\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
"ldr x26, [x15, #0x48]\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "ssubl v22.8h, v22.8b, v14.8b\n"
"ldr x25, [x15, #0x50]\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "ldr d8, [x20, x17]\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "ssubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr d21, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
"ldr x24, [x15, #0x58]\n"
"ldr x23, [x15, #0x60]\n"
- "smlal v2.4s, v11.4h, v31.4h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
"ldr x22, [x15, #0x68]\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
"ldr x21, [x15, #0x70]\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
- "smlal v9.4s, v4.4h, v16.4h\n"
"ldr x20, [x15, #0x78]\n"
- "tst x7, #0x7\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "ldr d27, [x26, x17]\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "ldr d11, [x25, x17]\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "ssubl v18.8h, v18.8b, v11.8b\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "ldr d22, [x26, x17]\n"
+ "smlal v0.4s, v21.4h, v6.4h\n"
+ "smlal2 v24.4s, v21.8h, v6.8h\n"
+ "ldr d21, [x25, x17]\n"
+ "ssubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
+ "tst x8, #0x7\n"
"add x13, x13, #0x20\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal v10.4s, v22.4h, v20.4h\n"
- "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "ssubl v22.8h, v22.8b, v11.8b\n"
"add x12, x12, #0x20\n"
- "smlal2 v24.4s, v4.8h, v16.8h\n"
- "smlal v9.4s, v8.4h, v1.4h\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "ldr d15, [x24, x17]\n"
- "ssubl v15.8h, v15.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v20.8h\n"
- "ldr d22, [x23, x17]\n"
- "smlal v7.4s, v4.4h, v23.4h\n"
- "ssubl v22.8h, v22.8b, v14.8b\n"
- "smlal v2.4s, v27.4h, v18.4h\n"
- "smlal v10.4s, v27.4h, v26.4h\n"
- "smlal2 v24.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v20.4h\n"
- "smlal2 v0.4s, v4.8h, v23.8h\n"
- "ldr d4, [x22, x17]\n"
- "smlal2 v30.4s, v27.8h, v18.8h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v26.8h\n"
- "ldr d26, [x21, x17]\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "ssubl v26.8h, v26.8b, v14.8b\n"
- "smlal v2.4s, v11.4h, v23.4h\n"
- "smlal v10.4s, v15.4h, v1.4h\n"
- "smlal2 v24.4s, v27.8h, v20.8h\n"
- "smlal v9.4s, v11.4h, v5.4h\n"
- "smlal2 v0.4s, v8.8h, v16.8h\n"
- "ldr d16, [x20, x17]\n"
- "smlal2 v30.4s, v11.8h, v23.8h\n"
- "ssubl v16.8h, v16.8b, v14.8b\n"
- "smlal2 v6.4s, v15.8h, v1.8h\n"
- "smlal v7.4s, v27.4h, v25.4h\n"
+ "smlal v31.4s, v18.4h, v9.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "ldr d20, [x24, x17]\n"
+ "smlal v2.4s, v19.4h, v4.4h\n"
+ "smlal2 v1.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v18.8h, v9.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v8.4s, v19.4h, v15.4h\n"
+ "smlal v31.4s, v22.4h, v25.4h\n"
+ "ssubl v20.8h, v20.8b, v11.8b\n"
+ "smlal2 v30.4s, v19.8h, v15.8h\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v0.4s, v22.4h, v10.4h\n"
+ "smlal2 v24.4s, v22.8h, v10.8h\n"
+ "smlal v2.4s, v29.4h, v5.4h\n"
+ "smlal2 v1.4s, v29.8h, v5.8h\n"
+ "ssubl v18.8h, v18.8b, v11.8b\n"
+ "smlal2 v27.4s, v22.8h, v25.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal v8.4s, v29.4h, v4.4h\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "smlal v31.4s, v20.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"add x17, x17, #0x8\n"
- "smlal v2.4s, v22.4h, v5.4h\n"
- "smlal v10.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "smlal v9.4s, v22.4h, v31.4h\n"
- "sqrdmulh v9.4s, v9.4s, v28.4s\n"
- "smlal2 v0.4s, v27.8h, v25.8h\n"
- "smlal2 v30.4s, v22.8h, v5.8h\n"
- "and v1.16b, v9.16b, v17.16b\n"
- "smlal2 v6.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v15.4h, v18.4h\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smlal v2.4s, v26.4h, v25.4h\n"
- "smlal v10.4s, v26.4h, v31.4h\n"
- "sqadd v9.4s, v9.4s, v1.4s\n"
- "smlal2 v24.4s, v22.8h, v31.8h\n"
- "smlal2 v0.4s, v15.8h, v18.8h\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- "smlal2 v30.4s, v26.8h, v25.8h\n"
- "smlal2 v6.4s, v26.8h, v31.8h\n"
- "and v31.16b, v24.16b, v3.16b\n"
- "smlal v7.4s, v4.4h, v20.4h\n"
- "smlal v2.4s, v16.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v28.4s\n"
- "smlal v10.4s, v16.4h, v25.4h\n"
- "smlal2 v0.4s, v4.8h, v20.8h\n"
- "sqrdmulh v2.4s, v2.4s, v28.4s\n"
- "smlal2 v30.4s, v16.8h, v20.8h\n"
- "smlal2 v6.4s, v16.8h, v25.8h\n"
- "sqrdmulh v10.4s, v10.4s, v28.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v22.16b, v7.16b, v17.16b\n"
- "sqrdmulh v0.4s, v0.4s, v21.4s\n"
- "and v15.16b, v2.16b, v17.16b\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "and v11.16b, v10.16b, v17.16b\n"
- "sqrdmulh v6.4s, v6.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
+ "smlal v0.4s, v21.4h, v15.4h\n"
+ "smlal2 v24.4s, v21.8h, v15.8h\n"
+ "smlal v2.4s, v22.4h, v9.4h\n"
+ "smlal2 v1.4s, v22.8h, v9.8h\n"
+ "ssubl v25.8h, v25.8b, v11.8b\n"
+ "smlal2 v27.4s, v20.8h, v5.8h\n"
+ "smlal v8.4s, v22.4h, v7.4h\n"
+ "ssubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v31.4s, v19.4h, v10.4h\n"
+ "smlal2 v30.4s, v22.8h, v7.8h\n"
+ "smlal v0.4s, v18.4h, v3.4h\n"
+ "smlal2 v24.4s, v18.8h, v3.8h\n"
+ "smlal v2.4s, v21.4h, v3.4h\n"
+ "smlal2 v1.4s, v21.8h, v3.8h\n"
+ "smlal2 v27.4s, v19.8h, v10.8h\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal v31.4s, v25.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v10.8h\n"
+ "smlal v0.4s, v25.4h, v7.4h\n"
+ "smlal2 v24.4s, v25.8h, v7.8h\n"
+ "smlal v2.4s, v18.4h, v6.4h\n"
+ "smlal2 v1.4s, v18.8h, v6.8h\n"
+ "smlal2 v27.4s, v25.8h, v6.8h\n"
+ "smlal v8.4s, v19.4h, v9.4h\n"
+ "smlal v31.4s, v29.4h, v7.4h\n"
+ "smlal2 v30.4s, v19.8h, v9.8h\n"
+ "smlal v0.4s, v29.4h, v9.4h\n"
+ "smlal2 v24.4s, v29.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v26.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v17.4s\n"
+ "smlal2 v27.4s, v29.8h, v7.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v26.4s\n"
+ "and v25.16b, v2.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v26.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "and v22.16b, v1.16b, v23.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v21.16b, v8.16b, v28.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v17.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v17.4s\n"
"sshr v22.4s, v22.4s, #0x1f\n"
- "and v18.16b, v0.16b, v3.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "and v23.16b, v30.16b, v3.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v21.16b, v6.16b, v3.16b\n"
- "sqadd v7.4s, v7.4s, v22.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v15.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v11.4s\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sqadd v2.4s, v2.4s, v25.4s\n"
+ "and v19.16b, v31.16b, v28.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "sqadd v30.4s, v30.4s, v23.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "sqadd v6.4s, v6.4s, v21.4s\n"
- "srshl v24.4s, v24.4s, v3.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
+ "and v10.16b, v30.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v17.16b, v24.16b, v23.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v22.16b, v27.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v28.4s\n"
+ "srshl v8.4s, v8.4s, v28.4s\n"
+ "sqadd v30.4s, v30.4s, v10.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v3.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d9, [x11, x16]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v23.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v23.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v23.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str d7, [x10, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d2, [x9, x16]\n"
- "str d10, [x28, x16]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str d2, [x10, x16]\n"
+ "str d8, [x9, x16]\n"
+ "str d0, [x28, x16]\n"
+ "str d31, [x27, x16]\n"
"add x16, x16, #0x8\n"
"beq 64f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v9.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v24.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v2.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v24.s }[0], [x20]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v9.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v9.s }[2], [x20]\n"
+ "tbz x8, #1, 6f\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v2.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v9.s }[0], [x20]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v2.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "ssubl v23.8h, v23.8b, v19.8b\n"
- "ssubl v16.8h, v16.8b, v19.8b\n"
- "ldr d20, [x14, #0x40]\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v15.8h, v15.8b, v16.8b\n"
+ "ssubl v4.8h, v4.8b, v16.8b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldp x24, x23, [x15, #0x0]\n"
- "ssubl v1.8h, v1.8b, v19.8b\n"
- "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v16.8b\n"
+ "ssubl v3.8h, v3.8b, v16.8b\n"
+ "ssubl v25.8h, v25.8b, v16.8b\n"
+ "ssubl v10.8h, v10.8b, v16.8b\n"
+ "ssubl v6.8h, v6.8b, v16.8b\n"
+ "ssubl v7.8h, v7.8b, v16.8b\n"
"ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
- "ssubl v26.8h, v26.8b, v19.8b\n"
- "ssubl v18.8h, v18.8b, v19.8b\n"
- "ssubl v31.8h, v31.8b, v19.8b\n"
- "ssubl v25.8h, v25.8b, v19.8b\n"
- "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v9.8h, v9.8b, v16.8b\n"
"add x24, x24, x17\n"
"add x23, x23, x17\n"
+ "ldr x20, [x15, #0x20]\n"
"add x22, x22, x17\n"
"add x21, x21, x17\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v22.s }[0], [x24], #0x4\n"
- "ld1 { v4.s }[0], [x23], #0x4\n"
- "ld1 { v8.s }[0], [x22], #0x4\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v22.h }[2], [x24], #0x2\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v8.h }[2], [x22], #0x2\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[6], [x24]\n"
- "ld1 { v4.b }[6], [x23]\n"
- "ld1 { v8.b }[6], [x22]\n"
- "ld1 { v27.b }[6], [x21]\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v19.s }[0], [x24], #0x4\n"
+ "ld1 { v21.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v22.s }[0], [x21], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v21.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v21.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[4], [x24]\n"
- "ld1 { v4.b }[4], [x23]\n"
- "ld1 { v8.b }[4], [x22]\n"
- "ld1 { v27.b }[4], [x21]\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v21.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v22.h }[0], [x24], #0x2\n"
- "ld1 { v4.h }[0], [x23], #0x2\n"
- "ld1 { v8.h }[0], [x22], #0x2\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v8.b }[2], [x22]\n"
- "ld1 { v27.b }[2], [x21]\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x8, #1, 10f\n"
+ "ld1 { v19.h }[0], [x24], #0x2\n"
+ "ld1 { v21.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v22.h }[0], [x21], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v21.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[0], [x24]\n"
- "ld1 { v4.b }[0], [x23]\n"
- "ld1 { v8.b }[0], [x22]\n"
- "ld1 { v27.b }[0], [x21]\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[0], [x24]\n"
+ "ld1 { v21.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v22.8h, v22.8b, v14.8b\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ssubl v19.8h, v19.8b, v11.8b\n"
+ "ssubl v21.8h, v21.8b, v11.8b\n"
"ldr x20, [x15, #0x28]\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "ssubl v8.8h, v8.8b, v14.8b\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ssubl v29.8h, v29.8b, v11.8b\n"
+ "ssubl v22.8h, v22.8b, v11.8b\n"
+ "ssubl v20.8h, v20.8b, v11.8b\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
"add x20, x20, x17\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ssubl v15.8h, v15.8b, v14.8b\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x8, #1, 14f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v21.8h, v21.8b, v14.8b\n"
- "smlal v2.4s, v21.4h, v31.4h\n"
- "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x30]\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v0.4s, v17.4h, v6.4h\n"
+ "smlal2 v24.4s, v17.8h, v6.8h\n"
"add x20, x20, x17\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x8, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x38]\n"
- "smlal v10.4s, v28.4h, v20.4h\n"
- "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "smlal v31.4s, v16.4h, v9.4h\n"
+ "smlal2 v27.4s, v16.8h, v9.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x8, #1, 22f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x40]\n"
- "smlal v9.4s, v22.4h, v16.4h\n"
- "smlal2 v24.4s, v22.8h, v16.8h\n"
- "smlal v7.4s, v22.4h, v23.4h\n"
- "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "smlal v2.4s, v17.4h, v4.4h\n"
+ "smlal2 v1.4s, v17.8h, v4.8h\n"
+ "smlal v8.4s, v17.4h, v15.4h\n"
+ "smlal2 v30.4s, v17.8h, v15.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x8, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ssubl v21.8h, v21.8b, v14.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x48]\n"
- "smlal v9.4s, v21.4h, v1.4h\n"
- "smlal2 v24.4s, v21.8h, v1.8h\n"
- "smlal v7.4s, v21.4h, v16.4h\n"
- "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "smlal v2.4s, v16.4h, v5.4h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal2 v30.4s, v16.8h, v4.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x8, #1, 30f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x50]\n"
- "smlal v9.4s, v28.4h, v20.4h\n"
- "smlal2 v24.4s, v28.8h, v20.8h\n"
- "smlal v7.4s, v28.4h, v25.4h\n"
- "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "smlal v2.4s, v17.4h, v9.4h\n"
+ "smlal2 v1.4s, v17.8h, v9.8h\n"
+ "smlal v8.4s, v17.4h, v7.4h\n"
+ "smlal2 v30.4s, v17.8h, v7.8h\n"
+ "smlal v0.4s, v17.4h, v10.4h\n"
+ "smlal2 v24.4s, v17.8h, v10.8h\n"
+ "smlal v31.4s, v17.4h, v25.4h\n"
"add x20, x20, x17\n"
- "smlal v2.4s, v28.4h, v18.4h\n"
- "smlal2 v30.4s, v28.8h, v18.8h\n"
- "smlal v10.4s, v28.4h, v26.4h\n"
- "smlal2 v6.4s, v28.8h, v26.8h\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "smlal2 v27.4s, v17.8h, v25.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x8, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v9.4s, v8.4h, v5.4h\n"
- "smlal2 v24.4s, v8.8h, v5.8h\n"
- "smlal v2.4s, v8.4h, v23.4h\n"
- "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "smlal v2.4s, v16.4h, v3.4h\n"
+ "smlal2 v1.4s, v16.8h, v3.8h\n"
+ "smlal v0.4s, v16.4h, v15.4h\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x8, #1, 38f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v7.4s, v8.4h, v18.4h\n"
- "smlal2 v0.4s, v8.8h, v18.8h\n"
- "smlal v10.4s, v8.4h, v1.4h\n"
- "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "smlal v8.4s, v17.4h, v10.4h\n"
+ "smlal2 v30.4s, v17.8h, v10.8h\n"
+ "smlal v31.4s, v17.4h, v5.4h\n"
+ "smlal2 v27.4s, v17.8h, v5.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
- "ld1 { v17.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
- "ld1 { v17.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[6], [x20]\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[4], [x20]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 42f\n"
- "ld1 { v17.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[2], [x20]\n"
+ "tbz x8, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[0], [x20]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v17.8h, v17.8b, v14.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v9.4s, v17.4h, v31.4h\n"
- "smlal2 v24.4s, v17.8h, v31.8h\n"
- "smlal v2.4s, v17.4h, v5.4h\n"
- "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "smlal v2.4s, v16.4h, v6.4h\n"
+ "smlal2 v1.4s, v16.8h, v6.8h\n"
+ "smlal v0.4s, v16.4h, v3.4h\n"
+ "smlal2 v24.4s, v16.8h, v3.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x8, #1, 46f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v23.8h, v23.8b, v14.8b\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x70]\n"
- "smlal v7.4s, v23.4h, v20.4h\n"
- "smlal2 v0.4s, v23.8h, v20.8h\n"
- "smlal v10.4s, v23.4h, v18.4h\n"
- "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v30.4s, v17.8h, v9.8h\n"
+ "smlal v31.4s, v17.4h, v10.4h\n"
+ "smlal2 v27.4s, v17.8h, v10.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
- "ld1 { v5.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[6], [x20]\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[4], [x20]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 50f\n"
- "ld1 { v5.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "tbz x8, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[0], [x20]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ssubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x78]\n"
- "smlal v2.4s, v5.4h, v25.4h\n"
- "smlal2 v30.4s, v5.8h, v25.8h\n"
- "smlal v10.4s, v5.4h, v31.4h\n"
- "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "smlal v0.4s, v16.4h, v7.4h\n"
+ "smlal2 v24.4s, v16.8h, v7.8h\n"
+ "smlal v31.4s, v16.4h, v6.4h\n"
+ "smlal2 v27.4s, v16.8h, v6.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x8, #1, 54f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v23.8h, v23.8b, v14.8b\n"
- "smlal v2.4s, v23.4h, v20.4h\n"
- "smlal2 v30.4s, v23.8h, v20.8h\n"
- "smlal v10.4s, v23.4h, v25.4h\n"
- "smlal2 v6.4s, v23.8h, v25.8h\n"
- "tbz x7, #2, 57f\n"
- "ld1 { v15.4s }, [x13], #0x10\n"
- "ld1 { v19.4s }, [x12], #0x10\n"
- "tbz x7, #1, 56f\n"
+ "ssubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v0.4s, v17.4h, v9.4h\n"
+ "smlal2 v24.4s, v17.8h, v9.8h\n"
+ "smlal v31.4s, v17.4h, v7.4h\n"
+ "smlal2 v27.4s, v17.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v16.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
+ "tbz x8, #1, 56f\n"
"ld1 { v18.d }[0], [x13], #0x8\n"
"ld1 { v22.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v18.s }[2], [x13]\n"
"ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v18.s }[0], [x13]\n"
"ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 58f\n"
- "ld1 { v15.d }[0], [x13], #0x8\n"
- "ld1 { v19.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v15.s }[2], [x13]\n"
- "ld1 { v19.s }[2], [x12]\n"
+ "tbz x8, #1, 58f\n"
+ "ld1 { v16.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v15.s }[0], [x13]\n"
- "ld1 { v19.s }[0], [x12]\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v9.4s, v9.4s, v15.4s\n"
- "and v17.16b, v9.16b, v19.16b\n"
- "add x11, x11, x16\n"
+ "sqrdmulh v2.4s, v2.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
"add x10, x10, x16\n"
- "sqrdmulh v24.4s, v24.4s, v18.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
"add x9, x9, x16\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v16.4s\n"
"add x28, x28, x16\n"
- "and v20.16b, v24.16b, v22.16b\n"
- "sqrdmulh v7.4s, v7.4s, v15.4s\n"
- "sqrdmulh v2.4s, v2.4s, v15.4s\n"
- "sqrdmulh v10.4s, v10.4s, v15.4s\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v21.16b, v7.16b, v19.16b\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "and v15.16b, v2.16b, v19.16b\n"
+ "add x27, x27, x16\n"
+ "sqrdmulh v31.4s, v31.4s, v16.4s\n"
"sqrdmulh v30.4s, v30.4s, v18.4s\n"
- "and v23.16b, v10.16b, v19.16b\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
+ "and v17.16b, v2.16b, v23.16b\n"
+ "and v16.16b, v1.16b, v22.16b\n"
+ "and v21.16b, v8.16b, v23.16b\n"
+ "and v20.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v18.16b, v0.16b, v22.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "and v17.16b, v30.16b, v22.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v28.16b, v6.16b, v22.16b\n"
- "sqadd v7.4s, v7.4s, v21.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v15.4s\n"
+ "and v19.16b, v30.16b, v22.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v18.16b, v24.16b, v22.16b\n"
+ "sqadd v2.4s, v2.4s, v17.4s\n"
+ "and v17.16b, v31.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v22.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v23.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v19.4s\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v19.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v6.4s, v6.4s, v28.4s\n"
- "srshl v24.4s, v24.4s, v22.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v22.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v23.4s\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v22.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v22.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "tbz x7, #2, 61f\n"
- "st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v7.s }[0], [x10], #0x4\n"
- "st1 { v2.s }[0], [x9], #0x4\n"
- "st1 { v10.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 60f\n"
- "st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v7.h }[2], [x10], #0x2\n"
- "st1 { v2.h }[2], [x9], #0x2\n"
- "st1 { v10.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v7.b }[6], [x10], #0x1\n"
- "st1 { v2.b }[6], [x9], #0x1\n"
- "st1 { v10.b }[6], [x28], #0x1\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v2.s }[0], [x10], #0x4\n"
+ "st1 { v8.s }[0], [x9], #0x4\n"
+ "st1 { v0.s }[0], [x28], #0x4\n"
+ "st1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v2.h }[2], [x10], #0x2\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v0.h }[2], [x28], #0x2\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[6], [x10], #0x1\n"
+ "st1 { v8.b }[6], [x9], #0x1\n"
+ "st1 { v0.b }[6], [x28], #0x1\n"
+ "st1 { v31.b }[6], [x27], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v7.b }[4], [x10], #0x1\n"
- "st1 { v2.b }[4], [x9], #0x1\n"
- "st1 { v10.b }[4], [x28], #0x1\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[4], [x10], #0x1\n"
+ "st1 { v8.b }[4], [x9], #0x1\n"
+ "st1 { v0.b }[4], [x28], #0x1\n"
+ "st1 { v31.b }[4], [x27], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v7.h }[0], [x10], #0x2\n"
- "st1 { v2.h }[0], [x9], #0x2\n"
- "st1 { v10.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v7.b }[2], [x10], #0x1\n"
- "st1 { v2.b }[2], [x9], #0x1\n"
- "st1 { v10.b }[2], [x28], #0x1\n"
+ "tbz x8, #1, 62f\n"
+ "st1 { v2.h }[0], [x10], #0x2\n"
+ "st1 { v8.h }[0], [x9], #0x2\n"
+ "st1 { v0.h }[0], [x28], #0x2\n"
+ "st1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[2], [x10], #0x1\n"
+ "st1 { v8.b }[2], [x9], #0x1\n"
+ "st1 { v0.b }[2], [x28], #0x1\n"
+ "st1 { v31.b }[2], [x27], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v7.b }[0], [x10], #0x1\n"
- "st1 { v2.b }[0], [x9], #0x1\n"
- "st1 { v10.b }[0], [x28], #0x1\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[0], [x10], #0x1\n"
+ "st1 { v8.b }[0], [x9], #0x1\n"
+ "st1 { v0.b }[0], [x28], #0x1\n"
+ "st1 { v31.b }[0], [x27], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
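For reference, the output stage these s8q kernels apply after the `smlal` accumulation is the `Requantize32` fixed-point path visible in the assembly above: `sqrdmulh` by the per-channel multiplier loaded from `requant_muls`, a rounding right shift by the per-channel value from `requant_shifts` (`srshl` preceded by a sign-dependent `sqadd` fixup), `sqxtn` narrowing, addition of `c_offset`, clamping to `minval`/`maxval`, and a final `uzp1` to bytes. The scalar sketch below is illustrative only: it assumes a single per-tensor multiplier and a non-positive shift, and uses a plain round-half-up shift rather than the exact `sqadd` fixup, so it is not the library implementation.

#include <algorithm>
#include <cstdint>

// Scalar sketch of the requantization applied to each int32 accumulator.
// 'mul' and 'shift' stand in for one lane of the requant_muls / requant_shifts
// arrays the kernel loads; c_offset/minval/maxval mirror the fields of
// arm_gemm::Requantize32 referenced in the asm operands. Illustrative only.

static int32_t sqrdmulh_scalar(int32_t a, int32_t b)
{
    // Saturating rounding doubling high multiply, as SQRDMULH does per lane.
    if (a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX;  // the only case that saturates
    }
    const int64_t prod = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    return static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
}

static int32_t rounding_right_shift(int32_t x, int32_t shift)
{
    // Assumes shift <= 0 (SRSHL with a negative shift amount). Rounds half up;
    // the kernels instead add a sign-dependent fixup before shifting, which
    // can differ by one for some negative inputs.
    const int32_t s = -shift;
    if (s == 0)
    {
        return x;
    }
    return (x + (1 << (s - 1))) >> s;
}

int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                  int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh_scalar(acc, mul);   // sqrdmulh
    v = rounding_right_shift(v, shift);      // sqadd fixup + srshl (approximated)
    v += c_offset;                           // sqadd with the c_offset vector
    v = std::min(std::max(v, minval), maxval);  // smax / smin clamp
    return static_cast<int8_t>(v);           // sqxtn / uzp1 narrowing
}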
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index d98ab71cb8..d26a37c654 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const int8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const int8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,1294 +100,1294 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v6.16b }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x17, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
+ "ld1r { v14.16b }, [x21]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v17.8h }, [x21]\n"
- "ld1r { v24.8h }, [x20]\n"
- "mov x17, #0x0\n"
- "mov x16, #0x0\n"
- "add x15, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "ssubl v11.8h, v11.8b, v15.8b\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "ssubl v22.8h, v22.8b, v15.8b\n"
- "ssubl v14.8h, v14.8b, v15.8b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "ssubl v28.8h, v28.8b, v15.8b\n"
- "ssubl v18.8h, v18.8b, v15.8b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "ssubl v9.8h, v9.8b, v15.8b\n"
- "ssubl v26.8h, v26.8b, v15.8b\n"
- "ldr d4, [x14, #0x40]\n"
+ "ld1r { v23.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x17, 3f\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "subs x17, x17, #0x1\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "ldr d7, [x6, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr q19, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
"add x20, x20, #0x20\n"
+ "ssubl v7.8h, v7.8b, v14.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d25, [x27, x17]\n"
- "ldr d27, [x26, x17]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d1, [x25, x17]\n"
- "ldr d2, [x24, x17]\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "ldr d12, [x23, x17]\n"
- "ldr d16, [x22, x17]\n"
- "ssubl v1.8h, v1.8b, v6.8b\n"
- "ssubl v2.8h, v2.8b, v6.8b\n"
- "ldr d23, [x21, x17]\n"
- "ldr d10, [x20, x17]\n"
- "ssubl v12.8h, v12.8b, v6.8b\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ssubl v23.8h, v23.8b, v6.8b\n"
- "ssubl v10.8h, v10.8b, v6.8b\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "ldr d26, [x27, x3]\n"
+ "ldr d31, [x26, x3]\n"
+ "ldr d20, [x25, x3]\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr d6, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d0, [x21, x3]\n"
+ "ldr d18, [x20, x3]\n"
+ "ssubl v26.8h, v26.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ssubl v20.8h, v20.8b, v13.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v18.8h, v18.8b, v13.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q30, [x13, #0x0]\n"
- "ldr q29, [x12, #0x0]\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x21, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "ldr x25, [x15, #0x60]\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ldr d27, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "ldr d25, [x20, x17]\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal v20.4s, v27.4h, v28.4h\n"
- "smlal v19.4s, v25.4h, v18.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ldr d1, [x25, x17]\n"
- "ssubl v1.8h, v1.8b, v6.8b\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "ldr d2, [x24, x17]\n"
- "ssubl v2.8h, v2.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v28.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v18.8h\n"
- "ldr d25, [x22, x17]\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v20.4s, v1.4h, v11.4h\n"
- "smlal v19.4s, v2.4h, v22.4h\n"
- "ldr x24, [x15, #0x50]\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ldr d16, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "ldr d12, [x20, x17]\n"
- "ldr x23, [x15, #0x48]\n"
- "smlal2 v0.4s, v1.8h, v11.8h\n"
- "smlal2 v31.4s, v2.8h, v22.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal v20.4s, v27.4h, v18.4h\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v19.4s, v25.4h, v9.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ldr d23, [x25, x17]\n"
- "ssubl v12.8h, v12.8b, v6.8b\n"
- "ssubl v23.8h, v23.8b, v6.8b\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "ldr d11, [x24, x17]\n"
- "ssubl v11.8h, v11.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v18.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v9.8h\n"
- "ldr d25, [x21, x17]\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v21.4s, v16.4h, v18.4h\n"
- "smlal v20.4s, v12.4h, v22.4h\n"
- "smlal v19.4s, v23.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "ldr d10, [x20, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "ssubl v10.8h, v10.8b, v6.8b\n"
- "smlal v5.4s, v11.4h, v9.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal2 v8.4s, v16.8h, v18.8h\n"
- "ldr d18, [x22, x17]\n"
- "ldr d16, [x21, x17]\n"
- "smlal2 v0.4s, v12.8h, v22.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal2 v31.4s, v23.8h, v14.8h\n"
- "ldr q14, [x13, #0x10]\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v25.4h, v26.4h\n"
- "smlal v19.4s, v10.4h, v28.4h\n"
- "ssubl v18.8h, v18.8b, v6.8b\n"
- "ldr x21, [x15, #0xc0]\n"
- "smlal2 v3.4s, v11.8h, v9.8h\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q30, [x8, #0x0]\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "ldr x24, [x5, #0x58]\n"
+ "ldr x23, [x5, #0x78]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "ldr x22, [x5, #0x60]\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "ldr q26, [x7, #0x10]\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "ldr d31, [x24, x3]\n"
+ "ldr x12, [x5, #0x88]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "ldr x11, [x5, #0x40]\n"
+ "ldr x10, [x5, #0x70]\n"
+ "add x6, x6, #0x48\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x23, x3]\n"
+ "ldr x9, [x5, #0x98]\n"
+ "subs x17, x17, #0x1\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x28, [x5, #0x50]\n"
+ "ldr x27, [x5, #0x48]\n"
+ "add x7, x7, #0x20\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "ldr d20, [x22, x3]\n"
+ "ldr x26, [x5, #0x90]\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "ldr x25, [x5, #0xa8]\n"
+ "ldr x24, [x5, #0xa0]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "ldr d31, [x21, x3]\n"
+ "ldr x23, [x5, #0xb0]\n"
+ "ssubl v20.8h, v20.8b, v13.8b\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "ldr x22, [x5, #0xb8]\n"
+ "smlal v3.4s, v28.4h, v27.4h\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "ldr x21, [x5, #0xc0]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v8.4s, v27.8h, v9.8h\n"
- "ldr d27, [x21, x17]\n"
- "smlal2 v0.4s, v25.8h, v26.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal2 v31.4s, v10.8h, v28.8h\n"
- "smlal v21.4s, v11.4h, v28.4h\n"
- "ssubl v22.8h, v22.8b, v6.8b\n"
- "add x14, x14, #0x48\n"
- "smlal v20.4s, v18.4h, v7.4h\n"
- "smlal v19.4s, v16.4h, v7.4h\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "add x17, x17, #0x8\n"
- "smlal2 v3.4s, v1.8h, v26.8h\n"
- "smlal v5.4s, v12.4h, v7.4h\n"
- "sqrdmulh v5.4s, v5.4s, v30.4s\n"
- "subs x8, x8, #0x1\n"
- "smlal2 v8.4s, v11.8h, v28.8h\n"
- "smlal2 v0.4s, v18.8h, v7.8h\n"
- "and v28.16b, v5.16b, v29.16b\n"
- "add x13, x13, #0x20\n"
- "smlal2 v31.4s, v16.8h, v7.8h\n"
- "smlal v21.4s, v2.4h, v7.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v27.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "ldr d9, [x11, x3]\n"
+ "smlal v10.4s, v20.4h, v16.4h\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "smlal2 v21.4s, v20.8h, v16.8h\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "ldr d16, [x10, x3]\n"
+ "smlal v3.4s, v31.4h, v11.4h\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "smlal2 v24.4s, v31.8h, v11.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "ldr d0, [x9, x3]\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "smlal v10.4s, v6.4h, v27.4h\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "smlal2 v21.4s, v6.8h, v27.8h\n"
+ "ldr d6, [x28, x3]\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal v8.4s, v9.4h, v27.4h\n"
+ "smlal2 v4.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "ldr d27, [x26, x3]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x25, x3]\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "ldr d18, [x24, x3]\n"
+ "smlal v10.4s, v16.4h, v11.4h\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x23, x3]\n"
+ "smlal v3.4s, v0.4h, v29.4h\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v6.4h, v2.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d29, [x22, x3]\n"
+ "smlal2 v1.4s, v6.8h, v2.8h\n"
+ "ssubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "ssubl v11.8h, v11.8b, v13.8b\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal v10.4s, v27.4h, v22.4h\n"
+ "smlal v3.4s, v28.4h, v15.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal v19.4s, v20.4h, v22.4h\n"
+ "smlal2 v21.4s, v27.8h, v22.8h\n"
+ "ldr q27, [x8, #0x10]\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v15.8h\n"
+ "smlal2 v1.4s, v20.8h, v22.8h\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "add x8, x8, #0x20\n"
+ "smlal v8.4s, v6.4h, v15.4h\n"
+ "smlal2 v4.4s, v6.8h, v15.8h\n"
+ "smlal v10.4s, v18.4h, v5.4h\n"
+ "smlal v3.4s, v11.4h, v5.4h\n"
+ "smlal v19.4s, v16.4h, v5.4h\n"
+ "smlal2 v21.4s, v18.8h, v5.8h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal v8.4s, v31.4h, v5.4h\n"
+ "smlal2 v4.4s, v31.8h, v5.8h\n"
+ "smlal v10.4s, v28.4h, v2.4h\n"
+ "smlal v3.4s, v29.4h, v22.4h\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal2 v24.4s, v29.8h, v22.8h\n"
+ "sqrdmulh v1.4s, v1.4s, v26.4s\n"
+ "smlal v8.4s, v0.4h, v7.4h\n"
+ "and v2.16b, v19.16b, v30.16b\n"
+ "smlal2 v4.4s, v0.8h, v7.8h\n"
+ "smlal v10.4s, v29.4h, v7.4h\n"
+ "smlal v3.4s, v9.4h, v7.4h\n"
+ "and v11.16b, v1.16b, v27.16b\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal2 v24.4s, v9.8h, v7.8h\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqrdmulh v4.4s, v4.4s, v26.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "and v29.16b, v8.16b, v30.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "and v20.16b, v10.16b, v30.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "and v28.16b, v3.16b, v30.16b\n"
+ "sqadd v1.4s, v1.4s, v11.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v7.16b, v4.16b, v27.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v2.16b, v21.16b, v27.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "add x12, x12, #0x20\n"
- "smlal v20.4s, v10.4h, v9.4h\n"
- "smlal v19.4s, v22.4h, v26.4h\n"
- "sqadd v5.4s, v5.4s, v28.4s\n"
- "smlal2 v3.4s, v12.8h, v7.8h\n"
- "smlal2 v8.4s, v2.8h, v7.8h\n"
- "sqrdmulh v3.4s, v3.4s, v14.4s\n"
- "smlal2 v0.4s, v10.8h, v9.8h\n"
- "smlal2 v31.4s, v22.8h, v26.8h\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "smlal v21.4s, v23.4h, v4.4h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "sqrdmulh v21.4s, v21.4s, v30.4s\n"
- "smlal v19.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "smlal2 v0.4s, v22.8h, v4.8h\n"
- "smlal2 v31.4s, v27.8h, v4.8h\n"
- "sqrdmulh v19.4s, v19.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v12.16b, v21.16b, v29.16b\n"
- "sqrdmulh v8.4s, v8.4s, v14.4s\n"
- "and v23.16b, v20.16b, v29.16b\n"
- "sqrdmulh v0.4s, v0.4s, v14.4s\n"
- "and v9.16b, v19.16b, v29.16b\n"
- "sqrdmulh v31.4s, v31.4s, v14.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v12.4s, v12.4s, #0x1f\n"
- "and v18.16b, v8.16b, v25.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v22.16b, v0.16b, v25.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v16.16b, v31.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v12.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v23.4s\n"
+ "and v22.16b, v24.16b, v27.16b\n"
+ "sqadd v8.4s, v8.4s, v29.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v3.4s, v3.4s, v28.4s\n"
"sshr v22.4s, v22.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v9.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "srshl v21.4s, v21.4s, v29.4s\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v29.4s\n"
- "sqadd v0.4s, v0.4s, v22.4s\n"
- "srshl v19.4s, v19.4s, v29.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v25.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v30.4s\n"
+ "sqadd v4.4s, v4.4s, v7.4s\n"
+ "srshl v10.4s, v10.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v2.4s\n"
+ "srshl v3.4s, v3.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v27.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str d5, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
+ "srshl v4.4s, v4.4s, v27.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v27.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d20, [x9, x16]\n"
- "str d19, [x28, x16]\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str d19, [x16, x4]\n"
+ "str d8, [x15, x4]\n"
+ "str d10, [x14, x4]\n"
+ "str d3, [x13, x4]\n"
+ "add x4, x4, #0x8\n"
+ "ldr q19, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "add x16, x16, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d4, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ssubl v11.8h, v11.8b, v15.8b\n"
- "ssubl v22.8h, v22.8b, v15.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ssubl v14.8h, v14.8b, v15.8b\n"
- "ssubl v28.8h, v28.8b, v15.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ldr d25, [x27, x17]\n"
- "ssubl v18.8h, v18.8b, v15.8b\n"
- "ssubl v9.8h, v9.8b, v15.8b\n"
- "ldr d27, [x26, x17]\n"
- "ldr d1, [x25, x17]\n"
- "ssubl v26.8h, v26.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ldr d2, [x24, x17]\n"
- "ldr d12, [x23, x17]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "ldr d16, [x22, x17]\n"
- "ldr d23, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "ssubl v1.8h, v1.8b, v6.8b\n"
- "ldr d10, [x20, x17]\n"
- "ssubl v2.8h, v2.8b, v6.8b\n"
- "ssubl v12.8h, v12.8b, v6.8b\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ssubl v23.8h, v23.8b, v6.8b\n"
- "ssubl v10.8h, v10.8b, v6.8b\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d7, [x6, #0x40]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "ssubl v7.8h, v7.8b, v14.8b\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "ldr d26, [x27, x3]\n"
+ "ldr d31, [x26, x3]\n"
+ "ldr d20, [x25, x3]\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr d6, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d0, [x21, x3]\n"
+ "ssubl v26.8h, v26.8b, v13.8b\n"
+ "ldr d18, [x20, x3]\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ssubl v20.8h, v20.8b, v13.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v18.8h, v18.8b, v13.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q29, [x13, #0x0]\n"
- "ldr q30, [x12, #0x0]\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x21, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "ldr x25, [x15, #0x60]\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ldr d27, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "ldr d25, [x20, x17]\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal v20.4s, v27.4h, v28.4h\n"
- "smlal v19.4s, v25.4h, v18.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ldr d1, [x25, x17]\n"
- "ssubl v1.8h, v1.8b, v6.8b\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "ldr d2, [x24, x17]\n"
- "ssubl v2.8h, v2.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v28.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v18.8h\n"
- "ldr d25, [x22, x17]\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v20.4s, v1.4h, v11.4h\n"
- "smlal v19.4s, v2.4h, v22.4h\n"
- "ldr x24, [x15, #0x50]\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ldr d16, [x21, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "ldr d12, [x20, x17]\n"
- "ldr x23, [x15, #0x48]\n"
- "smlal2 v0.4s, v1.8h, v11.8h\n"
- "smlal2 v31.4s, v2.8h, v22.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal v20.4s, v27.4h, v18.4h\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v19.4s, v25.4h, v9.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ldr d23, [x25, x17]\n"
- "ssubl v12.8h, v12.8b, v6.8b\n"
- "ssubl v23.8h, v23.8b, v6.8b\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "ldr d11, [x24, x17]\n"
- "ssubl v11.8h, v11.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v18.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v9.8h\n"
- "ldr d25, [x21, x17]\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v21.4s, v16.4h, v18.4h\n"
- "smlal v20.4s, v12.4h, v22.4h\n"
- "smlal v19.4s, v23.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "ldr d10, [x20, x17]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "ssubl v10.8h, v10.8b, v6.8b\n"
- "smlal v5.4s, v11.4h, v9.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal2 v8.4s, v16.8h, v18.8h\n"
- "ldr d16, [x22, x17]\n"
- "ldr d18, [x21, x17]\n"
- "smlal2 v0.4s, v12.8h, v22.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal2 v31.4s, v23.8h, v14.8h\n"
- "ldr q14, [x13, #0x10]\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v25.4h, v26.4h\n"
- "smlal v19.4s, v10.4h, v28.4h\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v3.4s, v11.8h, v9.8h\n"
- "ssubl v18.8h, v18.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v26.4h\n"
- "tst x7, #0x7\n"
- "smlal2 v8.4s, v27.8h, v9.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal2 v0.4s, v25.8h, v26.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal2 v31.4s, v10.8h, v28.8h\n"
- "smlal v21.4s, v11.4h, v28.4h\n"
- "ssubl v22.8h, v22.8b, v6.8b\n"
- "add x17, x17, #0x8\n"
- "smlal v20.4s, v16.4h, v7.4h\n"
- "smlal v19.4s, v18.4h, v7.4h\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "add x13, x13, #0x20\n"
- "smlal2 v3.4s, v1.8h, v26.8h\n"
- "smlal v5.4s, v12.4h, v7.4h\n"
- "sqrdmulh v5.4s, v5.4s, v29.4s\n"
- "add x12, x12, #0x20\n"
- "smlal2 v8.4s, v11.8h, v28.8h\n"
- "smlal2 v0.4s, v16.8h, v7.8h\n"
- "and v16.16b, v5.16b, v30.16b\n"
- "smlal2 v31.4s, v18.8h, v7.8h\n"
- "smlal v21.4s, v2.4h, v7.4h\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smlal v20.4s, v10.4h, v9.4h\n"
- "smlal v19.4s, v22.4h, v26.4h\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "smlal2 v3.4s, v12.8h, v7.8h\n"
- "smlal2 v8.4s, v2.8h, v7.8h\n"
- "sqrdmulh v3.4s, v3.4s, v14.4s\n"
- "smlal2 v0.4s, v10.8h, v9.8h\n"
- "smlal2 v31.4s, v22.8h, v26.8h\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "smlal v21.4s, v23.4h, v4.4h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "sqrdmulh v21.4s, v21.4s, v29.4s\n"
- "smlal v19.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "sqrdmulh v20.4s, v20.4s, v29.4s\n"
- "smlal2 v0.4s, v22.8h, v4.8h\n"
- "smlal2 v31.4s, v27.8h, v4.8h\n"
- "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "ldr q30, [x7, #0x0]\n"
+ "ldr q17, [x8, #0x0]\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x24, [x5, #0x78]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "ldr x23, [x5, #0x60]\n"
+ "ldr x10, [x5, #0x80]\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "ldr q26, [x7, #0x10]\n"
+ "ldr x22, [x5, #0x68]\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "ldr d31, [x20, x3]\n"
+ "ldr x21, [x5, #0x88]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "ldr x20, [x5, #0x40]\n"
+ "ldr x9, [x5, #0x70]\n"
+ "tst x2, #0x7\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr x28, [x5, #0x98]\n"
+ "add x7, x7, #0x20\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x27, [x5, #0x50]\n"
+ "ldr x26, [x5, #0x48]\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "ldr d20, [x23, x3]\n"
+ "ldr x25, [x5, #0x90]\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "ldr x24, [x5, #0xa8]\n"
+ "ldr x23, [x5, #0xa0]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "ldr d31, [x10, x3]\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal v3.4s, v28.4h, v27.4h\n"
+ "ssubl v20.8h, v20.8b, v13.8b\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "ldr x22, [x5, #0xb0]\n"
+ "smlal2 v24.4s, v28.8h, v27.8h\n"
+ "ldr d28, [x21, x3]\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "ldr x21, [x5, #0xb8]\n"
+ "smlal v10.4s, v20.4h, v16.4h\n"
+ "smlal2 v21.4s, v20.8h, v16.8h\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "ldr d16, [x9, x3]\n"
+ "smlal v3.4s, v31.4h, v11.4h\n"
+ "smlal2 v24.4s, v31.8h, v11.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "ldr d0, [x28, x3]\n"
+ "smlal v10.4s, v6.4h, v27.4h\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "smlal2 v21.4s, v6.8h, v27.8h\n"
+ "ldr d6, [x27, x3]\n"
+ "smlal v8.4s, v9.4h, v27.4h\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal2 v4.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x26, x3]\n"
+ "ldr d27, [x25, x3]\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x24, x3]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v10.4s, v16.4h, v11.4h\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "ldr d18, [x23, x3]\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x22, x3]\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v3.4s, v0.4h, v29.4h\n"
+ "smlal v19.4s, v6.4h, v2.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d29, [x21, x3]\n"
+ "ssubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v22.4h\n"
+ "smlal2 v1.4s, v6.8h, v2.8h\n"
+ "ssubl v11.8h, v11.8b, v13.8b\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v21.4s, v27.8h, v22.8h\n"
+ "ldr q27, [x8, #0x10]\n"
+ "smlal v3.4s, v28.4h, v15.4h\n"
+ "smlal v19.4s, v20.4h, v22.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v24.4s, v28.8h, v15.8h\n"
+ "smlal v8.4s, v6.4h, v15.4h\n"
+ "add x8, x8, #0x20\n"
+ "smlal v10.4s, v18.4h, v5.4h\n"
+ "smlal2 v1.4s, v20.8h, v22.8h\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "smlal2 v4.4s, v6.8h, v15.8h\n"
+ "smlal2 v21.4s, v18.8h, v5.8h\n"
+ "smlal v3.4s, v11.4h, v5.4h\n"
+ "smlal v19.4s, v16.4h, v5.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v8.4s, v31.4h, v5.4h\n"
+ "smlal v10.4s, v28.4h, v2.4h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal2 v4.4s, v31.8h, v5.8h\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal v3.4s, v29.4h, v22.4h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "smlal2 v24.4s, v29.8h, v22.8h\n"
+ "smlal v8.4s, v0.4h, v7.4h\n"
+ "smlal v10.4s, v29.4h, v7.4h\n"
+ "sqrdmulh v1.4s, v1.4s, v26.4s\n"
+ "and v5.16b, v19.16b, v17.16b\n"
+ "smlal2 v4.4s, v0.8h, v7.8h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v3.4s, v9.4h, v7.4h\n"
+ "smlal2 v24.4s, v9.8h, v7.8h\n"
+ "and v16.16b, v1.16b, v27.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v30.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v30.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v23.16b, v21.16b, v30.16b\n"
- "sqrdmulh v8.4s, v8.4s, v14.4s\n"
- "and v27.16b, v20.16b, v30.16b\n"
- "sqrdmulh v0.4s, v0.4s, v14.4s\n"
- "and v22.16b, v19.16b, v30.16b\n"
- "sqrdmulh v31.4s, v31.4s, v14.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v14.16b, v8.16b, v25.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v18.16b, v0.16b, v25.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v16.16b, v31.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v23.4s\n"
- "sshr v14.4s, v14.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v27.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v26.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v5.4s\n"
+ "and v30.16b, v8.16b, v17.16b\n"
+ "and v20.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v2.16b, v3.16b, v17.16b\n"
+ "and v11.16b, v4.16b, v27.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v27.16b\n"
+ "and v16.16b, v24.16b, v27.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v30.4s\n"
- "srshl v21.4s, v21.4s, v30.4s\n"
- "sqadd v8.4s, v8.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v19.4s, v19.4s, v30.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqadd v3.4s, v3.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v17.4s\n"
+ "srshl v8.4s, v8.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v11.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "srshl v3.4s, v3.4s, v17.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v27.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str d5, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
+ "srshl v4.4s, v4.4s, v27.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v27.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d20, [x9, x16]\n"
- "str d19, [x28, x16]\n"
- "add x16, x16, #0x8\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str d19, [x16, x4]\n"
+ "str d8, [x15, x4]\n"
+ "str d10, [x14, x4]\n"
+ "str d3, [x13, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 88f\n"
- "add x14, x14, #0x48\n"
+ "add x6, x6, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v5.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v3.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v3.s }[2], [x20]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v19.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v3.s }[0], [x20]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v5.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v5.s }[2], [x20]\n"
+ "tbz x2, #1, 6f\n"
+ "ld1 { v19.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v19.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v5.s }[0], [x20]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v19.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "ssubl v11.8h, v11.8b, v15.8b\n"
- "ssubl v22.8h, v22.8b, v15.8b\n"
- "ldr d4, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ssubl v14.8h, v14.8b, v15.8b\n"
- "ssubl v28.8h, v28.8b, v15.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ssubl v18.8h, v18.8b, v15.8b\n"
- "ssubl v9.8h, v9.8b, v15.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ssubl v26.8h, v26.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "add x27, x27, x17\n"
- "add x26, x26, x17\n"
- "add x25, x25, x17\n"
- "add x24, x24, x17\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "ld1 { v1.s }[0], [x25], #0x4\n"
- "ld1 { v2.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v16.s }[0], [x22], #0x4\n"
- "ld1 { v23.s }[0], [x21], #0x4\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
- "ld1 { v2.h }[2], [x24], #0x2\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
- "ld1 { v16.h }[2], [x22], #0x2\n"
- "ld1 { v23.h }[2], [x21], #0x2\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[6], [x27]\n"
- "ld1 { v27.b }[6], [x26]\n"
- "ld1 { v1.b }[6], [x25]\n"
- "ld1 { v2.b }[6], [x24]\n"
- "ld1 { v12.b }[6], [x23]\n"
- "ld1 { v16.b }[6], [x22]\n"
- "ld1 { v23.b }[6], [x21]\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "ldr d7, [x6, #0x40]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "ssubl v7.8h, v7.8b, v14.8b\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v20.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v0.s }[0], [x21], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v20.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v0.h }[2], [x21], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v20.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v6.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v0.b }[6], [x21]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[4], [x27]\n"
- "ld1 { v27.b }[4], [x26]\n"
- "ld1 { v1.b }[4], [x25]\n"
- "ld1 { v2.b }[4], [x24]\n"
- "ld1 { v12.b }[4], [x23]\n"
- "ld1 { v16.b }[4], [x22]\n"
- "ld1 { v23.b }[4], [x21]\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v20.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v6.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v0.b }[4], [x21]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "ld1 { v1.h }[0], [x25], #0x2\n"
- "ld1 { v2.h }[0], [x24], #0x2\n"
- "ld1 { v12.h }[0], [x23], #0x2\n"
- "ld1 { v16.h }[0], [x22], #0x2\n"
- "ld1 { v23.h }[0], [x21], #0x2\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[2], [x27]\n"
- "ld1 { v27.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "ld1 { v12.b }[2], [x23]\n"
- "ld1 { v16.b }[2], [x22]\n"
- "ld1 { v23.b }[2], [x21]\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x2, #1, 10f\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v20.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v0.h }[0], [x21], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v20.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v6.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v0.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[0], [x27]\n"
- "ld1 { v27.b }[0], [x26]\n"
- "ld1 { v1.b }[0], [x25]\n"
- "ld1 { v2.b }[0], [x24]\n"
- "ld1 { v12.b }[0], [x23]\n"
- "ld1 { v16.b }[0], [x22]\n"
- "ld1 { v23.b }[0], [x21]\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v20.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v6.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v0.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v25.8h, v25.8b, v6.8b\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x20, [x15, #0x40]\n"
- "ssubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ssubl v1.8h, v1.8b, v6.8b\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ssubl v2.8h, v2.8b, v6.8b\n"
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ssubl v12.8h, v12.8b, v6.8b\n"
- "ssubl v23.8h, v23.8b, v6.8b\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ssubl v10.8h, v10.8b, v6.8b\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "ssubl v26.8h, v26.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x40]\n"
+ "ssubl v20.8h, v20.8b, v13.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "ssubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x2, #1, 14f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v15.8h, v15.8b, v6.8b\n"
- "ldr x20, [x15, #0x48]\n"
- "smlal v21.4s, v15.4h, v18.4h\n"
- "smlal2 v8.4s, v15.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x48]\n"
+ "smlal v8.4s, v30.4h, v27.4h\n"
+ "smlal2 v4.4s, v30.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 18f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x50]\n"
- "smlal v21.4s, v16.4h, v9.4h\n"
- "smlal2 v8.4s, v16.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x50]\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 22f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v5.4s, v16.4h, v9.4h\n"
- "smlal2 v3.4s, v16.8h, v9.8h\n"
- "smlal v21.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v19.4s, v17.4h, v2.4h\n"
+ "smlal2 v1.4s, v17.8h, v2.8h\n"
+ "smlal v8.4s, v17.4h, v15.4h\n"
+ "smlal2 v4.4s, v17.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 26f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v20.4s, v16.4h, v28.4h\n"
- "smlal2 v0.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 30f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v5.4s, v16.4h, v26.4h\n"
- "smlal2 v3.4s, v16.8h, v26.8h\n"
- "smlal v20.4s, v16.4h, v11.4h\n"
- "smlal2 v0.4s, v16.8h, v11.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v19.4s, v17.4h, v22.4h\n"
+ "smlal2 v1.4s, v17.8h, v22.8h\n"
+ "smlal v10.4s, v17.4h, v16.4h\n"
+ "smlal2 v21.4s, v17.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 34f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal v20.4s, v16.4h, v18.4h\n"
- "smlal2 v0.4s, v16.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v10.4s, v30.4h, v27.4h\n"
+ "smlal2 v21.4s, v30.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 38f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v16.4h, v7.4h\n"
- "smlal2 v3.4s, v16.8h, v7.8h\n"
- "smlal v20.4s, v16.4h, v22.4h\n"
- "smlal2 v0.4s, v16.8h, v22.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v19.4s, v17.4h, v5.4h\n"
+ "smlal2 v1.4s, v17.8h, v5.8h\n"
+ "smlal v10.4s, v17.4h, v11.4h\n"
+ "smlal2 v21.4s, v17.8h, v11.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 41f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
+ "tbz x2, #1, 40f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 42f\n"
+ "tbz x2, #1, 42f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x80]\n"
- "smlal v19.4s, v16.4h, v18.4h\n"
- "smlal2 v31.4s, v16.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v3.4s, v16.4h, v27.4h\n"
+ "smlal2 v24.4s, v16.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 46f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x88]\n"
- "smlal v21.4s, v16.4h, v7.4h\n"
- "smlal2 v8.4s, v16.8h, v7.8h\n"
- "smlal v19.4s, v16.4h, v22.4h\n"
- "smlal2 v31.4s, v16.8h, v22.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v8.4s, v17.4h, v5.4h\n"
+ "smlal2 v4.4s, v17.8h, v5.8h\n"
+ "smlal v3.4s, v17.4h, v11.4h\n"
+ "smlal2 v24.4s, v17.8h, v11.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 49f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
+ "tbz x2, #1, 48f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x7, #1, 50f\n"
+ "tbz x2, #1, 50f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x90]\n"
- "smlal v19.4s, v16.4h, v9.4h\n"
- "smlal2 v31.4s, v16.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v3.4s, v16.4h, v2.4h\n"
+ "smlal2 v24.4s, v16.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 54f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x98]\n"
- "smlal v20.4s, v16.4h, v26.4h\n"
- "smlal2 v0.4s, v16.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 57f\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v10.4s, v17.4h, v22.4h\n"
+ "smlal2 v21.4s, v17.8h, v22.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 57f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 56f\n"
+ "tbz x2, #1, 56f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x7, #1, 58f\n"
+ "tbz x2, #1, 58f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal v21.4s, v16.4h, v4.4h\n"
- "smlal2 v8.4s, v16.8h, v4.8h\n"
- "smlal v19.4s, v16.4h, v14.4h\n"
- "smlal2 v31.4s, v16.8h, v14.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 61f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 60f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v8.4s, v16.4h, v7.4h\n"
+ "smlal2 v4.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v29.4h\n"
+ "smlal2 v24.4s, v16.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 62f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v20.4s, v16.4h, v7.4h\n"
- "smlal2 v0.4s, v16.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 65f\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v10.4s, v17.4h, v5.4h\n"
+ "smlal2 v21.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 65f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 64f\n"
+ "tbz x2, #1, 64f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 66f\n"
+ "tbz x2, #1, 66f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xb0]\n"
- "smlal v20.4s, v16.4h, v9.4h\n"
- "smlal2 v0.4s, v16.8h, v9.8h\n"
- "smlal v19.4s, v16.4h, v28.4h\n"
- "smlal2 v31.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 69f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 68f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v10.4s, v16.4h, v2.4h\n"
+ "smlal2 v21.4s, v16.8h, v2.8h\n"
+ "smlal v3.4s, v16.4h, v15.4h\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x7, #1, 70f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 70f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal v19.4s, v16.4h, v7.4h\n"
- "smlal2 v31.4s, v16.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 73f\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v3.4s, v17.4h, v5.4h\n"
+ "smlal2 v24.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 73f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 72f\n"
+ "tbz x2, #1, 72f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x7, #1, 74f\n"
+ "tbz x2, #1, 74f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal v20.4s, v16.4h, v4.4h\n"
- "smlal2 v0.4s, v16.8h, v4.8h\n"
- "smlal v19.4s, v16.4h, v26.4h\n"
- "smlal2 v31.4s, v16.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 77f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 76f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v10.4s, v16.4h, v7.4h\n"
+ "smlal2 v21.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v22.4h\n"
+ "smlal2 v24.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x7, #1, 78f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 78f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "ssubl v16.8h, v16.8b, v6.8b\n"
- "smlal v19.4s, v16.4h, v4.4h\n"
- "smlal2 v31.4s, v16.8h, v4.8h\n"
- "tbz x7, #2, 81f\n"
- "ld1 { v14.4s }, [x13], #0x10\n"
- "ld1 { v25.4s }, [x12], #0x10\n"
- "tbz x7, #1, 80f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v12.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x12]\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "smlal v3.4s, v17.4h, v7.4h\n"
+ "smlal2 v24.4s, v17.8h, v7.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v16.4s }, [x7], #0x10\n"
+ "ld1 { v22.4s }, [x8], #0x10\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v0.d }[0], [x7], #0x8\n"
+ "ld1 { v31.d }[0], [x8], #0x8\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v0.s }[2], [x7]\n"
+ "ld1 { v31.s }[2], [x8]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v12.s }[0], [x12]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v0.s }[0], [x7]\n"
+ "ld1 { v31.s }[0], [x8]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 82f\n"
- "ld1 { v14.d }[0], [x13], #0x8\n"
- "ld1 { v25.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v14.s }[2], [x13]\n"
- "ld1 { v25.s }[2], [x12]\n"
+ "tbz x2, #1, 82f\n"
+ "ld1 { v16.d }[0], [x7], #0x8\n"
+ "ld1 { v22.d }[0], [x8], #0x8\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v16.s }[2], [x7]\n"
+ "ld1 { v22.s }[2], [x8]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v14.s }[0], [x13]\n"
- "ld1 { v25.s }[0], [x12]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v16.s }[0], [x7]\n"
+ "ld1 { v22.s }[0], [x8]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v5.4s, v5.4s, v14.4s\n"
- "and v28.16b, v5.16b, v25.16b\n"
- "add x11, x11, x16\n"
- "add x10, x10, x16\n"
- "sqrdmulh v3.4s, v3.4s, v18.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "add x9, x9, x16\n"
- "add x28, x28, x16\n"
- "and v16.16b, v3.16b, v12.16b\n"
- "sqrdmulh v21.4s, v21.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v14.4s\n"
- "sqrdmulh v19.4s, v19.4s, v14.4s\n"
- "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v0.4s\n"
+ "add x16, x16, x4\n"
+ "add x15, x15, x4\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "add x14, x14, x4\n"
+ "add x13, x13, x4\n"
+ "sqrdmulh v3.4s, v3.4s, v16.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v0.4s\n"
+ "and v17.16b, v19.16b, v22.16b\n"
+ "and v16.16b, v1.16b, v31.16b\n"
+ "and v15.16b, v8.16b, v22.16b\n"
+ "and v20.16b, v10.16b, v22.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v14.16b, v21.16b, v25.16b\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "and v6.16b, v20.16b, v25.16b\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v31.4s, v31.4s, v18.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v14.4s, v14.4s, #0x1f\n"
- "and v18.16b, v8.16b, v12.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v7.16b, v0.16b, v12.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v16.16b, v31.16b, v12.16b\n"
- "sqadd v21.4s, v21.4s, v14.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v6.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v26.16b, v4.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v0.16b, v21.16b, v31.16b\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "and v17.16b, v3.16b, v22.16b\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v16.16b, v24.16b, v31.16b\n"
+ "sqadd v8.4s, v8.4s, v15.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v25.4s\n"
- "srshl v21.4s, v21.4s, v25.4s\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v25.4s\n"
- "sqadd v0.4s, v0.4s, v7.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v12.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v12.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v12.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v8.4s, v8.4s, v22.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v31.4s\n"
+ "srshl v3.4s, v3.4s, v22.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "srshl v4.4s, v4.4s, v31.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "tbz x7, #2, 85f\n"
- "st1 { v5.s }[0], [x11], #0x4\n"
- "st1 { v21.s }[0], [x10], #0x4\n"
- "st1 { v20.s }[0], [x9], #0x4\n"
- "st1 { v19.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 84f\n"
- "st1 { v5.h }[2], [x11], #0x2\n"
- "st1 { v21.h }[2], [x10], #0x2\n"
- "st1 { v20.h }[2], [x9], #0x2\n"
- "st1 { v19.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[6], [x11], #0x1\n"
- "st1 { v21.b }[6], [x10], #0x1\n"
- "st1 { v20.b }[6], [x9], #0x1\n"
- "st1 { v19.b }[6], [x28], #0x1\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "tbz x2, #2, 85f\n"
+ "st1 { v19.s }[0], [x16], #0x4\n"
+ "st1 { v8.s }[0], [x15], #0x4\n"
+ "st1 { v10.s }[0], [x14], #0x4\n"
+ "st1 { v3.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "st1 { v19.h }[2], [x16], #0x2\n"
+ "st1 { v8.h }[2], [x15], #0x2\n"
+ "st1 { v10.h }[2], [x14], #0x2\n"
+ "st1 { v3.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[6], [x16], #0x1\n"
+ "st1 { v8.b }[6], [x15], #0x1\n"
+ "st1 { v10.b }[6], [x14], #0x1\n"
+ "st1 { v3.b }[6], [x13], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[4], [x11], #0x1\n"
- "st1 { v21.b }[4], [x10], #0x1\n"
- "st1 { v20.b }[4], [x9], #0x1\n"
- "st1 { v19.b }[4], [x28], #0x1\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[4], [x16], #0x1\n"
+ "st1 { v8.b }[4], [x15], #0x1\n"
+ "st1 { v10.b }[4], [x14], #0x1\n"
+ "st1 { v3.b }[4], [x13], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 86f\n"
- "st1 { v5.h }[0], [x11], #0x2\n"
- "st1 { v21.h }[0], [x10], #0x2\n"
- "st1 { v20.h }[0], [x9], #0x2\n"
- "st1 { v19.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[2], [x11], #0x1\n"
- "st1 { v21.b }[2], [x10], #0x1\n"
- "st1 { v20.b }[2], [x9], #0x1\n"
- "st1 { v19.b }[2], [x28], #0x1\n"
+ "tbz x2, #1, 86f\n"
+ "st1 { v19.h }[0], [x16], #0x2\n"
+ "st1 { v8.h }[0], [x15], #0x2\n"
+ "st1 { v10.h }[0], [x14], #0x2\n"
+ "st1 { v3.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[2], [x16], #0x1\n"
+ "st1 { v8.b }[2], [x15], #0x1\n"
+ "st1 { v10.b }[2], [x14], #0x1\n"
+ "st1 { v3.b }[2], [x13], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[0], [x11], #0x1\n"
- "st1 { v21.b }[0], [x10], #0x1\n"
- "st1 { v20.b }[0], [x9], #0x1\n"
- "st1 { v19.b }[0], [x28], #0x1\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[0], [x16], #0x1\n"
+ "st1 { v8.b }[0], [x15], #0x1\n"
+ "st1 { v10.b }[0], [x14], #0x1\n"
+ "st1 { v3.b }[0], [x13], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index b1648bae14..bd85c150ef 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const int8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const int8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -113,1743 +113,1743 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x2, x1, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v18.16b }, [x20]\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x14, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v15.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v26.8h }, [x20]\n"
+ "ld1r { v9.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.8h }, [x21]\n"
- "ld1r { v0.8h }, [x20]\n"
- "mov x3, #0x0\n"
- "mov x4, #0x0\n"
- "add x5, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x16, [x22, #0x0]\n"
- "ldp x15, x14, [x22, #0x10]\n"
- "cbz x2, 3f\n"
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "subs x2, x2, #0x1\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ssubl v10.8h, v10.8b, v13.8b\n"
- "ldr d12, [x6, #0x20]\n"
+ "ld1r { v10.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "ldp x8, x17, [x22, #0x0]\n"
+ "ldp x16, x15, [x22, #0x10]\n"
+ "cbz x14, 3f\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "subs x14, x14, #0x1\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "ldr d23, [x5, #0x20]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q15, [x20, #0x10]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
"add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ldr d31, [x9, x3]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldr d17, [x28, x3]\n"
- "ldr d30, [x27, x3]\n"
- "ssubl v31.8h, v31.8b, v18.8b\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "ldr d16, [x26, x3]\n"
- "ldr d3, [x25, x3]\n"
- "ssubl v30.8h, v30.8b, v18.8b\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "ldr d4, [x24, x3]\n"
- "ldr d25, [x23, x3]\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "ldr d9, [x22, x3]\n"
- "ldr d29, [x21, x3]\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "ldr d28, [x20, x3]\n"
- "ssubl v29.8h, v29.8b, v18.8b\n"
- "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d24, [x9, x2]\n"
+ "ldr d21, [x28, x2]\n"
+ "ldr d16, [x27, x2]\n"
+ "ldr d20, [x26, x2]\n"
+ "ldr d7, [x25, x2]\n"
+ "ldr d19, [x24, x2]\n"
+ "ldr d28, [x23, x2]\n"
+ "ssubl v24.8h, v24.8b, v15.8b\n"
+ "ldr d26, [x22, x2]\n"
+ "ldr d29, [x21, x2]\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "ldr d18, [x20, x2]\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v29.8h, v29.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr d2, [x6, #0x28]\n"
- "ldr d27, [x6, #0x30]\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "ldr d1, [x6, #0x38]\n"
- "ldr d31, [x6, #0x40]\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "ldr d8, [x6, #0x48]\n"
- "ldr x22, [x5, #0x50]\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "ldr x20, [x5, #0x58]\n"
- "ldr x21, [x5, #0x60]\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "ldr x22, [x5, #0x70]\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "ldr d14, [x20, x3]\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "ssubl v14.8h, v14.8b, v18.8b\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal v23.4s, v17.4h, v10.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr x20, [x5, #0x78]\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "ldr x21, [x5, #0x80]\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "ldr d25, [x22, x3]\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v10.8h\n"
- "ldr d10, [x20, x3]\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "ssubl v10.8h, v10.8b, v18.8b\n"
- "smlal v24.4s, v17.4h, v21.4h\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x24, [x5, #0x88]\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "smlal v7.4s, v30.4h, v2.4h\n"
- "ldr x20, [x5, #0x90]\n"
- "ldr x23, [x5, #0x98]\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "ldr d9, [x21, x3]\n"
- "smlal2 v22.4s, v17.8h, v21.8h\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "ldr d21, [x6, #0x50]\n"
- "smlal v20.4s, v3.4h, v12.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "ldr x22, [x5, #0xa0]\n"
- "ldr x21, [x5, #0xa8]\n"
- "smlal2 v15.4s, v30.8h, v2.8h\n"
- "ldr d30, [x24, x3]\n"
- "smlal v7.4s, v16.4h, v27.4h\n"
- "ssubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v12.8h\n"
- "ldr d3, [x6, #0x58]\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "ldr d12, [x20, x3]\n"
- "smlal v20.4s, v16.4h, v2.4h\n"
- "ssubl v12.8h, v12.8b, v18.8b\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal v23.4s, v14.4h, v2.4h\n"
- "ldr x20, [x5, #0xb0]\n"
- "ldr x13, [x5, #0xb8]\n"
- "smlal2 v15.4s, v16.8h, v27.8h\n"
- "smlal v7.4s, v4.4h, v1.4h\n"
- "ldr x12, [x5, #0xc0]\n"
- "ldr x11, [x5, #0xc8]\n"
- "smlal2 v5.4s, v16.8h, v2.8h\n"
- "ldr d16, [x23, x3]\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v2.8h\n"
- "ldr d2, [x6, #0x60]\n"
- "smlal v20.4s, v4.4h, v27.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v27.4h\n"
- "smlal v23.4s, v25.4h, v27.4h\n"
- "ldr x10, [x5, #0xd0]\n"
- "ldr x9, [x5, #0xd8]\n"
- "smlal2 v15.4s, v4.8h, v1.8h\n"
- "smlal v7.4s, v17.4h, v31.4h\n"
- "ldr x28, [x5, #0xe0]\n"
- "ldr x27, [x5, #0xe8]\n"
- "smlal2 v5.4s, v4.8h, v27.8h\n"
- "ldr d4, [x22, x3]\n"
- "smlal2 v22.4s, v14.8h, v27.8h\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v27.8h\n"
- "ldr d27, [x6, #0x68]\n"
- "smlal v20.4s, v17.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v1.4h\n"
- "smlal v23.4s, v10.4h, v1.4h\n"
- "ldr x26, [x5, #0xf0]\n"
- "ldr x25, [x5, #0xf8]\n"
- "smlal2 v15.4s, v17.8h, v31.8h\n"
- "smlal v7.4s, v6.4h, v8.4h\n"
- "ldr x24, [x5, #0x100]\n"
- "ldr x23, [x5, #0x108]\n"
- "smlal2 v5.4s, v17.8h, v1.8h\n"
- "ldr d17, [x21, x3]\n"
- "smlal2 v22.4s, v25.8h, v1.8h\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v1.8h\n"
- "ldr d1, [x6, #0x70]\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v31.4h\n"
- "smlal v23.4s, v9.4h, v31.4h\n"
- "ldr x22, [x5, #0x110]\n"
- "ldr x21, [x5, #0x118]\n"
- "smlal2 v15.4s, v6.8h, v8.8h\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
- "subs x2, x2, #0x1\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal2 v22.4s, v10.8h, v31.8h\n"
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v31.8h\n"
- "ldr d31, [x6, #0x78]\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v9.4h, v8.4h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr d3, [x5, #0x28]\n"
+ "ldr d2, [x5, #0x30]\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "ldr d4, [x5, #0x38]\n"
+ "ldr d22, [x5, #0x40]\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ldr d24, [x5, #0x48]\n"
+ "ldr x23, [x4, #0x50]\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x50]\n"
+ "ldr x22, [x4, #0x58]\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "ldr d21, [x5, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x28, [x4, #0x70]\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "ldr d12, [x23, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x22, x2]\n"
+ "ldr x27, [x4, #0x78]\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ldr x26, [x4, #0x80]\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "ldr x25, [x4, #0x88]\n"
+ "ldr x24, [x4, #0x90]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
+ "ldr x23, [x4, #0x98]\n"
+ "ldr x22, [x4, #0xa0]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x21, x2]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v17.4h\n"
+ "smlal2 v30.4s, v12.8h, v17.8h\n"
+ "ldr d17, [x20, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal v1.4s, v12.4h, v11.4h\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "ldr x21, [x4, #0xa8]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal2 v25.4s, v12.8h, v11.8h\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x28, x2]\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v7.4h, v11.4h\n"
+ "smlal2 v30.4s, v7.8h, v11.8h\n"
+ "ldr d11, [x27, x2]\n"
+ "ldr x13, [x4, #0xb8]\n"
+ "smlal v27.4s, v28.4h, v23.4h\n"
+ "smlal v1.4s, v7.4h, v23.4h\n"
+ "ldr x12, [x4, #0xc0]\n"
+ "ldr x11, [x4, #0xc8]\n"
+ "smlal2 v6.4s, v28.8h, v23.8h\n"
+ "ldr d28, [x26, x2]\n"
+ "smlal2 v25.4s, v7.8h, v23.8h\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v16.4h, v3.4h\n"
+ "smlal2 v0.4s, v16.8h, v3.8h\n"
+ "ldr d16, [x25, x2]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "ldr d23, [x24, x2]\n"
+ "ldr x10, [x4, #0xd0]\n"
+ "smlal v27.4s, v20.4h, v3.4h\n"
+ "smlal v1.4s, v18.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x9, [x4, #0xd8]\n"
+ "smlal2 v6.4s, v20.8h, v3.8h\n"
+ "smlal2 v25.4s, v18.8h, v3.8h\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x28, [x4, #0xe0]\n"
+ "smlal v8.4s, v20.4h, v2.4h\n"
+ "smlal2 v0.4s, v20.8h, v2.8h\n"
+ "ldr d20, [x23, x2]\n"
+ "ssubl v23.8h, v23.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "smlal2 v30.4s, v17.8h, v3.8h\n"
+ "ldr d3, [x5, #0x60]\n"
+ "ldr x27, [x4, #0xe8]\n"
+ "smlal v27.4s, v19.4h, v2.4h\n"
+ "smlal v1.4s, v17.4h, v2.4h\n"
+ "ldr x26, [x4, #0xf0]\n"
+ "ldr x25, [x4, #0xf8]\n"
+ "smlal2 v6.4s, v19.8h, v2.8h\n"
+ "smlal2 v25.4s, v17.8h, v2.8h\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x24, [x4, #0x100]\n"
+ "smlal v8.4s, v19.4h, v4.4h\n"
+ "smlal2 v0.4s, v19.8h, v4.8h\n"
+ "ldr d19, [x22, x2]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v2.4h\n"
+ "smlal2 v30.4s, v26.8h, v2.8h\n"
+ "ldr d2, [x5, #0x68]\n"
+ "ldr x23, [x4, #0x108]\n"
+ "smlal v27.4s, v12.4h, v4.4h\n"
+ "smlal v1.4s, v26.4h, v4.4h\n"
+ "ldr x22, [x4, #0x110]\n"
+ "subs x14, x14, #0x1\n"
+ "smlal2 v6.4s, v12.8h, v4.8h\n"
+ "smlal2 v25.4s, v26.8h, v4.8h\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v22.4h\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d12, [x21, x2]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v11.8h, v4.8h\n"
+ "ldr d4, [x5, #0x70]\n"
+ "ldr x21, [x4, #0x118]\n"
+ "smlal v27.4s, v7.4h, v22.4h\n"
+ "smlal v1.4s, v11.4h, v22.4h\n"
+ "smlal2 v6.4s, v7.8h, v22.8h\n"
+ "smlal2 v25.4s, v11.8h, v22.8h\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v24.4h\n"
+ "smlal2 v0.4s, v7.8h, v24.8h\n"
+ "ldr d7, [x20, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v22.4h\n"
+ "smlal2 v30.4s, v28.8h, v22.8h\n"
+ "ldr d22, [x5, #0x78]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "ldr d28, [x13, x3]\n"
- "smlal v7.4s, v14.4h, v3.4h\n"
- "ssubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v5.4s, v29.8h, v8.8h\n"
- "ldr d29, [x6, #0x80]\n"
- "smlal2 v22.4s, v9.8h, v8.8h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "ldr d8, [x12, x3]\n"
- "smlal v20.4s, v14.4h, v21.4h\n"
- "ssubl v8.8h, v8.8b, v18.8b\n"
- "smlal v24.4s, v12.4h, v21.4h\n"
- "smlal v23.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v14.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v14.8h, v21.8h\n"
- "ldr d14, [x11, x3]\n"
- "smlal2 v22.4s, v12.8h, v21.8h\n"
- "ssubl v14.8h, v14.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v21.8h\n"
- "ldr d21, [x6, #0x88]\n"
- "smlal v20.4s, v25.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v16.4h, v3.4h\n"
- "smlal v23.4s, v4.4h, v3.4h\n"
- "smlal2 v15.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v10.4h, v27.4h\n"
- "smlal2 v5.4s, v25.8h, v3.8h\n"
- "ldr d25, [x10, x3]\n"
- "smlal2 v22.4s, v16.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v3.8h\n"
- "ldr d3, [x6, #0x90]\n"
- "smlal v20.4s, v10.4h, v2.4h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal v24.4s, v4.4h, v2.4h\n"
- "smlal v23.4s, v17.4h, v2.4h\n"
- "smlal2 v15.4s, v10.8h, v27.8h\n"
- "smlal v7.4s, v9.4h, v1.4h\n"
- "smlal2 v5.4s, v10.8h, v2.8h\n"
- "ldr d10, [x9, x3]\n"
- "smlal2 v22.4s, v4.8h, v2.8h\n"
- "ssubl v10.8h, v10.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v2.8h\n"
- "ldr d2, [x6, #0x98]\n"
- "smlal v20.4s, v9.4h, v27.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v17.4h, v27.4h\n"
- "smlal v23.4s, v6.4h, v27.4h\n"
- "smlal2 v15.4s, v9.8h, v1.8h\n"
- "smlal v7.4s, v12.4h, v31.4h\n"
- "smlal2 v5.4s, v9.8h, v27.8h\n"
- "ldr d9, [x28, x3]\n"
- "smlal2 v22.4s, v17.8h, v27.8h\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v27.8h\n"
- "ldr d27, [x6, #0xa0]\n"
- "smlal v20.4s, v30.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v1.4h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v15.4s, v12.8h, v31.8h\n"
- "ldr d12, [x27, x3]\n"
- "smlal v7.4s, v16.4h, v29.4h\n"
- "ssubl v12.8h, v12.8b, v18.8b\n"
- "smlal2 v5.4s, v30.8h, v1.8h\n"
- "ldr d30, [x6, #0xa8]\n"
- "smlal2 v22.4s, v6.8h, v1.8h\n"
- "ssubl v30.8h, v30.8b, v13.8b\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "ldr d1, [x26, x3]\n"
- "smlal v20.4s, v16.4h, v31.4h\n"
- "ssubl v1.8h, v1.8b, v18.8b\n"
- "smlal v24.4s, v8.4h, v31.4h\n"
- "smlal v23.4s, v14.4h, v31.4h\n"
- "smlal2 v15.4s, v16.8h, v29.8h\n"
- "smlal v7.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v16.8h, v31.8h\n"
- "ldr d16, [x25, x3]\n"
- "smlal2 v22.4s, v8.8h, v31.8h\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v31.8h\n"
- "ldr d31, [x6, #0xb0]\n"
- "smlal v20.4s, v4.4h, v29.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v29.4h\n"
- "smlal v23.4s, v25.4h, v29.4h\n"
- "smlal2 v15.4s, v4.8h, v21.8h\n"
- "smlal v7.4s, v17.4h, v3.4h\n"
- "smlal2 v5.4s, v4.8h, v29.8h\n"
- "ldr d4, [x24, x3]\n"
- "smlal2 v22.4s, v14.8h, v29.8h\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v29.8h\n"
- "ldr d29, [x6, #0xb8]\n"
- "smlal v20.4s, v17.4h, v21.4h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v15.4s, v17.8h, v3.8h\n"
- "smlal v7.4s, v6.4h, v2.4h\n"
- "smlal2 v5.4s, v17.8h, v21.8h\n"
- "ldr d17, [x23, x3]\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "ldr d21, [x6, #0xc0]\n"
- "smlal v20.4s, v6.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v3.4h\n"
- "smlal v23.4s, v9.4h, v3.4h\n"
- "add x6, x6, #0xc8\n"
- "smlal2 v15.4s, v6.8h, v2.8h\n"
- "smlal v7.4s, v8.4h, v27.4h\n"
- "smlal2 v5.4s, v6.8h, v3.8h\n"
- "ldr d6, [x22, x3]\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v3.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal v20.4s, v28.4h, v2.4h\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "smlal v24.4s, v9.4h, v2.4h\n"
- "smlal v23.4s, v12.4h, v2.4h\n"
- "add x3, x3, #0x8\n"
- "smlal2 v15.4s, v8.8h, v27.8h\n"
- "ldr q8, [x7, #0x0]\n"
- "smlal v7.4s, v14.4h, v30.4h\n"
- "smlal2 v5.4s, v28.8h, v2.8h\n"
- "ldr q28, [x8, #0x0]\n"
- "smlal2 v22.4s, v9.8h, v2.8h\n"
- "smlal2 v19.4s, v12.8h, v2.8h\n"
- "ldr q2, [x7, #0x10]\n"
- "smlal v20.4s, v14.4h, v27.4h\n"
+ "smlal v27.4s, v29.4h, v24.4h\n"
+ "smlal v1.4s, v28.4h, v24.4h\n"
+ "smlal2 v6.4s, v29.8h, v24.8h\n"
+ "ldr d29, [x13, x2]\n"
+ "smlal2 v25.4s, v28.8h, v24.8h\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v18.4h, v31.4h\n"
+ "smlal2 v0.4s, v18.8h, v31.8h\n"
+ "ldr d18, [x5, #0x80]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v24.4h\n"
+ "smlal2 v30.4s, v16.8h, v24.8h\n"
+ "ldr d24, [x12, x2]\n"
+ "smlal v27.4s, v17.4h, v31.4h\n"
+ "smlal v1.4s, v23.4h, v31.4h\n"
+ "ssubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v6.4s, v17.8h, v31.8h\n"
+ "smlal2 v25.4s, v23.8h, v31.8h\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v8.4s, v17.4h, v21.4h\n"
+ "smlal2 v0.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x11, x2]\n"
+ "ssubl v24.8h, v24.8b, v15.8b\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x88]\n"
+ "smlal v27.4s, v26.4h, v21.4h\n"
+ "smlal v1.4s, v20.4h, v21.4h\n"
+ "smlal2 v6.4s, v26.8h, v21.8h\n"
+ "smlal2 v25.4s, v20.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v3.4h\n"
+ "smlal2 v0.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x10, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v21.4h\n"
+ "smlal2 v30.4s, v19.8h, v21.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "smlal v27.4s, v11.4h, v3.4h\n"
+ "smlal v1.4s, v19.4h, v3.4h\n"
+ "smlal2 v6.4s, v11.8h, v3.8h\n"
+ "smlal2 v25.4s, v19.8h, v3.8h\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v11.4h, v2.4h\n"
+ "smlal2 v0.4s, v11.8h, v2.8h\n"
+ "ldr d11, [x9, x2]\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v3.4h\n"
+ "smlal2 v30.4s, v12.8h, v3.8h\n"
+ "ldr d3, [x5, #0x98]\n"
+ "smlal v27.4s, v28.4h, v2.4h\n"
+ "smlal v1.4s, v12.4h, v2.4h\n"
+ "smlal2 v6.4s, v28.8h, v2.8h\n"
+ "smlal2 v25.4s, v12.8h, v2.8h\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal2 v0.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x28, x2]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v2.4h\n"
+ "smlal2 v30.4s, v7.8h, v2.8h\n"
+ "ldr d2, [x5, #0xa0]\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v1.4s, v7.4h, v4.4h\n"
+ "smlal2 v6.4s, v16.8h, v4.8h\n"
+ "ldr d16, [x27, x2]\n"
+ "smlal2 v25.4s, v7.8h, v4.8h\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v22.4h\n"
+ "smlal2 v0.4s, v23.8h, v22.8h\n"
+ "ldr d23, [x5, #0xa8]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d4, [x26, x2]\n"
+ "smlal v27.4s, v20.4h, v22.4h\n"
+ "smlal v1.4s, v24.4h, v22.4h\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "smlal2 v6.4s, v20.8h, v22.8h\n"
+ "smlal2 v25.4s, v24.8h, v22.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x25, x2]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v22.4h\n"
+ "smlal2 v30.4s, v17.8h, v22.8h\n"
+ "ldr d22, [x5, #0xb0]\n"
+ "smlal v27.4s, v19.4h, v18.4h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v6.4s, v19.8h, v18.8h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "smlal v8.4s, v19.4h, v31.4h\n"
+ "smlal2 v0.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x24, x2]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v30.4s, v26.8h, v18.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "smlal v27.4s, v12.4h, v31.4h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v6.4s, v12.8h, v31.8h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v21.4h\n"
+ "smlal2 v0.4s, v12.8h, v21.8h\n"
+ "ldr d12, [x23, x2]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d31, [x5, #0xc0]\n"
+ "add x5, x5, #0xc8\n"
+ "smlal v27.4s, v7.4h, v21.4h\n"
+ "smlal v1.4s, v11.4h, v21.4h\n"
+ "smlal2 v6.4s, v7.8h, v21.8h\n"
+ "smlal2 v25.4s, v11.8h, v21.8h\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v3.4h\n"
+ "smlal2 v0.4s, v7.8h, v3.8h\n"
+ "ldr d7, [x22, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v21.4h\n"
+ "smlal2 v30.4s, v28.8h, v21.8h\n"
+ "ldr d21, [x21, x2]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v27.4s, v29.4h, v3.4h\n"
+ "smlal v1.4s, v28.4h, v3.4h\n"
+ "smlal2 v6.4s, v29.8h, v3.8h\n"
+ "ldr q29, [x6, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v3.8h\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal2 v0.4s, v24.8h, v2.8h\n"
+ "ldr q24, [x7, #0x0]\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "smlal v5.4s, v16.4h, v3.4h\n"
+ "smlal2 v30.4s, v16.8h, v3.8h\n"
+ "ldr q3, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v27.4s, v17.4h, v2.4h\n"
+ "smlal v1.4s, v4.4h, v2.4h\n"
+ "smlal2 v6.4s, v17.8h, v2.8h\n"
+ "smlal2 v25.4s, v4.8h, v2.8h\n"
+ "ldr q4, [x7, #0x10]\n"
"add x7, x7, #0x20\n"
- "smlal v24.4s, v1.4h, v27.4h\n"
- "smlal v23.4s, v16.4h, v27.4h\n"
- "smlal2 v15.4s, v14.8h, v30.8h\n"
- "smlal v7.4s, v25.4h, v31.4h\n"
- "smlal2 v5.4s, v14.8h, v27.8h\n"
- "ldr q14, [x8, #0x10]\n"
- "smlal2 v22.4s, v1.8h, v27.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v19.4s, v16.8h, v27.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal v24.4s, v16.4h, v30.4h\n"
- "smlal v23.4s, v4.4h, v30.4h\n"
- "smlal2 v15.4s, v25.8h, v31.8h\n"
- "smlal v7.4s, v10.4h, v29.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal2 v22.4s, v16.8h, v30.8h\n"
- "smlal2 v19.4s, v4.8h, v30.8h\n"
- "smlal v20.4s, v10.4h, v31.4h\n"
- "smlal v24.4s, v4.4h, v31.4h\n"
- "smlal v23.4s, v17.4h, v31.4h\n"
- "smlal2 v15.4s, v10.8h, v29.8h\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "sqrdmulh v7.4s, v7.4s, v8.4s\n"
- "smlal2 v5.4s, v10.8h, v31.8h\n"
- "smlal2 v22.4s, v4.8h, v31.8h\n"
- "and v27.16b, v7.16b, v28.16b\n"
- "smlal2 v19.4s, v17.8h, v31.8h\n"
- "smlal v20.4s, v9.4h, v29.4h\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smlal v24.4s, v17.4h, v29.4h\n"
- "smlal v23.4s, v6.4h, v29.4h\n"
- "sqadd v7.4s, v7.4s, v27.4s\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal2 v5.4s, v9.8h, v29.8h\n"
- "sqrdmulh v15.4s, v15.4s, v2.4s\n"
- "smlal2 v22.4s, v17.8h, v29.8h\n"
- "smlal2 v19.4s, v6.8h, v29.8h\n"
- "and v9.16b, v15.16b, v14.16b\n"
- "smlal v20.4s, v12.4h, v21.4h\n"
- "smlal v24.4s, v6.4h, v21.4h\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smlal v23.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v12.8h, v21.8h\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "smlal2 v22.4s, v6.8h, v21.8h\n"
- "smlal2 v19.4s, v3.8h, v21.8h\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v25.16b, v20.16b, v28.16b\n"
- "sqrdmulh v5.4s, v5.4s, v2.4s\n"
- "and v10.16b, v24.16b, v28.16b\n"
- "sqrdmulh v22.4s, v22.4s, v2.4s\n"
- "and v21.16b, v23.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v2.4s\n"
- "sqadd v15.4s, v15.4s, v9.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "and v9.16b, v5.16b, v14.16b\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "and v12.16b, v22.16b, v14.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v17.16b, v19.16b, v14.16b\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v10.4s\n"
+ "smlal v8.4s, v17.4h, v23.4h\n"
+ "smlal2 v0.4s, v17.8h, v23.8h\n"
+ "smlal v5.4s, v20.4h, v2.4h\n"
+ "smlal2 v30.4s, v20.8h, v2.8h\n"
+ "smlal v27.4s, v26.4h, v23.4h\n"
+ "smlal v1.4s, v20.4h, v23.4h\n"
+ "smlal2 v6.4s, v26.8h, v23.8h\n"
+ "smlal2 v25.4s, v20.8h, v23.8h\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal2 v0.4s, v26.8h, v22.8h\n"
+ "smlal v5.4s, v19.4h, v23.4h\n"
+ "smlal2 v30.4s, v19.8h, v23.8h\n"
+ "smlal v27.4s, v11.4h, v22.4h\n"
+ "smlal v1.4s, v19.4h, v22.4h\n"
+ "smlal2 v6.4s, v11.8h, v22.8h\n"
+ "smlal2 v25.4s, v19.8h, v22.8h\n"
+ "smlal v8.4s, v11.4h, v18.4h\n"
+ "smlal2 v0.4s, v11.8h, v18.8h\n"
+ "smlal v5.4s, v12.4h, v22.4h\n"
+ "smlal2 v30.4s, v12.8h, v22.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal v1.4s, v12.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal2 v25.4s, v12.8h, v18.8h\n"
+ "smlal v8.4s, v28.4h, v31.4h\n"
+ "smlal2 v0.4s, v28.8h, v31.8h\n"
+ "smlal v5.4s, v7.4h, v18.4h\n"
+ "smlal2 v30.4s, v7.8h, v18.8h\n"
+ "smlal v27.4s, v16.4h, v31.4h\n"
+ "smlal v1.4s, v7.4h, v31.4h\n"
+ "smlal2 v6.4s, v16.8h, v31.8h\n"
+ "smlal2 v25.4s, v7.8h, v31.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "smlal v5.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "and v17.16b, v8.16b, v24.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v12.16b, v0.16b, v4.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"sshr v12.4s, v12.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
+ "and v21.16b, v27.16b, v24.16b\n"
+ "and v16.16b, v1.16b, v24.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v28.16b, v5.16b, v24.16b\n"
+ "sqadd v0.4s, v0.4s, v12.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v17.16b, v25.16b, v4.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v3.16b, v30.16b, v4.16b\n"
+ "sqadd v27.4s, v27.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v28.4s\n"
- "srshl v20.4s, v20.4s, v28.4s\n"
- "sqadd v5.4s, v5.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqadd v22.4s, v22.4s, v12.4s\n"
- "srshl v23.4s, v23.4s, v28.4s\n"
- "sqadd v19.4s, v19.4s, v17.4s\n"
- "srshl v15.4s, v15.4s, v14.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v14.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v14.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x17, x4]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d20, [x16, x4]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x15, x4]\n"
- "str d23, [x14, x4]\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q15, [x20, #0x10]\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v24.4s\n"
+ "srshl v27.4s, v27.4s, v24.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v24.4s\n"
+ "sqadd v30.4s, v30.4s, v3.4s\n"
+ "srshl v0.4s, v0.4s, v4.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v4.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d8, [x8, x3]\n"
+ "str d27, [x17, x3]\n"
+ "str d1, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "add x3, x3, #0x8\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "add x4, x4, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldr d12, [x6, #0x20]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ldr d31, [x9, x3]\n"
- "ldr d17, [x28, x3]\n"
- "ssubl v10.8h, v10.8b, v13.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr d30, [x27, x3]\n"
- "ldr d16, [x26, x3]\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ssubl v31.8h, v31.8b, v18.8b\n"
- "ldr d3, [x25, x3]\n"
- "ldr d4, [x24, x3]\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "ssubl v30.8h, v30.8b, v18.8b\n"
- "ldr d25, [x23, x3]\n"
- "ldr d9, [x22, x3]\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "ldr d29, [x21, x3]\n"
- "ldr d28, [x20, x3]\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "ssubl v29.8h, v29.8b, v18.8b\n"
- "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "ldr d23, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d24, [x9, x2]\n"
+ "ldr d21, [x28, x2]\n"
+ "ldr d16, [x27, x2]\n"
+ "ldr d20, [x26, x2]\n"
+ "ldr d7, [x25, x2]\n"
+ "ldr d19, [x24, x2]\n"
+ "ldr d28, [x23, x2]\n"
+ "ldr d26, [x22, x2]\n"
+ "ssubl v24.8h, v24.8b, v15.8b\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "ldr d29, [x21, x2]\n"
+ "ldr d18, [x20, x2]\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v29.8h, v29.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr d27, [x6, #0x28]\n"
- "ldr d1, [x6, #0x30]\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "ldr d2, [x6, #0x38]\n"
- "ldr d31, [x6, #0x40]\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "ldr d8, [x6, #0x48]\n"
- "ldr x22, [x5, #0x50]\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "ldr x20, [x5, #0x58]\n"
- "ldr x21, [x5, #0x60]\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr x22, [x5, #0x70]\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "ldr d14, [x20, x3]\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "ssubl v14.8h, v14.8b, v18.8b\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal v23.4s, v17.4h, v10.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr x21, [x5, #0x78]\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "ldr x20, [x5, #0x80]\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "ldr d25, [x22, x3]\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v10.8h\n"
- "ldr d10, [x21, x3]\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "ssubl v10.8h, v10.8b, v18.8b\n"
- "smlal v24.4s, v17.4h, v21.4h\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x24, [x5, #0x88]\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "smlal v7.4s, v30.4h, v27.4h\n"
- "ldr x23, [x5, #0x90]\n"
- "ldr x22, [x5, #0x98]\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "ldr d9, [x20, x3]\n"
- "smlal2 v22.4s, v17.8h, v21.8h\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "ldr d21, [x6, #0x50]\n"
- "smlal v20.4s, v3.4h, v12.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "ldr x21, [x5, #0xa0]\n"
- "ldr x20, [x5, #0xa8]\n"
- "smlal2 v15.4s, v30.8h, v27.8h\n"
- "ldr d30, [x24, x3]\n"
- "smlal v7.4s, v16.4h, v1.4h\n"
- "ssubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v12.8h\n"
- "ldr d3, [x6, #0x58]\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "ldr d12, [x23, x3]\n"
- "smlal v20.4s, v16.4h, v27.4h\n"
- "ssubl v12.8h, v12.8b, v18.8b\n"
- "smlal v24.4s, v28.4h, v27.4h\n"
- "smlal v23.4s, v14.4h, v27.4h\n"
- "ldr x13, [x5, #0xb0]\n"
- "ldr x12, [x5, #0xb8]\n"
- "smlal2 v15.4s, v16.8h, v1.8h\n"
- "smlal v7.4s, v4.4h, v2.4h\n"
- "ldr x11, [x5, #0xc0]\n"
- "ldr x10, [x5, #0xc8]\n"
- "smlal2 v5.4s, v16.8h, v27.8h\n"
- "ldr d16, [x22, x3]\n"
- "smlal2 v22.4s, v28.8h, v27.8h\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v27.8h\n"
- "ldr d27, [x6, #0x60]\n"
- "smlal v20.4s, v4.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v1.4h\n"
- "smlal v23.4s, v25.4h, v1.4h\n"
- "ldr x9, [x5, #0xd0]\n"
- "ldr x28, [x5, #0xd8]\n"
- "smlal2 v15.4s, v4.8h, v2.8h\n"
- "smlal v7.4s, v17.4h, v31.4h\n"
- "ldr x27, [x5, #0xe0]\n"
- "ldr x26, [x5, #0xe8]\n"
- "smlal2 v5.4s, v4.8h, v1.8h\n"
- "ldr d4, [x21, x3]\n"
- "smlal2 v22.4s, v14.8h, v1.8h\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "ldr d1, [x6, #0x68]\n"
- "smlal v20.4s, v17.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v2.4h\n"
- "smlal v23.4s, v10.4h, v2.4h\n"
- "ldr x25, [x5, #0xf0]\n"
- "ldr x24, [x5, #0xf8]\n"
- "smlal2 v15.4s, v17.8h, v31.8h\n"
- "smlal v7.4s, v6.4h, v8.4h\n"
- "ldr x23, [x5, #0x100]\n"
- "ldr x22, [x5, #0x108]\n"
- "smlal2 v5.4s, v17.8h, v2.8h\n"
- "ldr d17, [x20, x3]\n"
- "smlal2 v22.4s, v25.8h, v2.8h\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v2.8h\n"
- "ldr d2, [x6, #0x70]\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v31.4h\n"
- "smlal v23.4s, v9.4h, v31.4h\n"
- "ldr x21, [x5, #0x110]\n"
- "ldr x20, [x5, #0x118]\n"
- "smlal2 v15.4s, v6.8h, v8.8h\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
+ "ldr d4, [x5, #0x28]\n"
+ "ldr d3, [x5, #0x30]\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "ldr d22, [x5, #0x38]\n"
+ "ldr d2, [x5, #0x40]\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ldr d24, [x5, #0x48]\n"
+ "ldr x21, [x4, #0x50]\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x50]\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "ldr d21, [x5, #0x58]\n"
+ "ldr x28, [x4, #0x60]\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x27, [x4, #0x68]\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "ldr d12, [x21, x2]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x20, x2]\n"
+ "ldr x25, [x4, #0x78]\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x24, [x4, #0x80]\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "ldr x23, [x4, #0x88]\n"
+ "ldr x22, [x4, #0x90]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x28, x2]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v17.4h\n"
+ "smlal2 v30.4s, v12.8h, v17.8h\n"
+ "ldr d17, [x27, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal v1.4s, v12.4h, v11.4h\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "ldr x14, [x4, #0xa8]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal2 v25.4s, v12.8h, v11.8h\n"
+ "ldr x13, [x4, #0xb0]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x26, x2]\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v7.4h, v11.4h\n"
+ "smlal2 v30.4s, v7.8h, v11.8h\n"
+ "ldr d11, [x25, x2]\n"
+ "ldr x12, [x4, #0xb8]\n"
+ "smlal v27.4s, v28.4h, v23.4h\n"
+ "smlal v1.4s, v7.4h, v23.4h\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "ldr x10, [x4, #0xc8]\n"
+ "smlal2 v6.4s, v28.8h, v23.8h\n"
+ "ldr d28, [x24, x2]\n"
+ "smlal2 v25.4s, v7.8h, v23.8h\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "ldr d16, [x23, x2]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "ldr d23, [x22, x2]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal v1.4s, v18.4h, v4.4h\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal2 v6.4s, v20.8h, v4.8h\n"
+ "smlal2 v25.4s, v18.8h, v4.8h\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal v8.4s, v20.4h, v3.4h\n"
+ "smlal2 v0.4s, v20.8h, v3.8h\n"
+ "ldr d20, [x21, x2]\n"
+ "ssubl v23.8h, v23.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v4.4h\n"
+ "smlal2 v30.4s, v17.8h, v4.8h\n"
+ "ldr d4, [x5, #0x60]\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal v27.4s, v19.4h, v3.4h\n"
+ "smlal v1.4s, v17.4h, v3.4h\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal2 v6.4s, v19.8h, v3.8h\n"
+ "smlal2 v25.4s, v17.8h, v3.8h\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal v8.4s, v19.4h, v22.4h\n"
+ "smlal2 v0.4s, v19.8h, v22.8h\n"
+ "ldr d19, [x20, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v3.4h\n"
+ "smlal2 v30.4s, v26.8h, v3.8h\n"
+ "ldr d3, [x5, #0x68]\n"
+ "ldr x22, [x4, #0x108]\n"
+ "smlal v27.4s, v12.4h, v22.4h\n"
+ "smlal v1.4s, v26.4h, v22.4h\n"
+ "ldr x21, [x4, #0x110]\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v6.4s, v12.8h, v22.8h\n"
+ "smlal2 v25.4s, v26.8h, v22.8h\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
"tst x1, #0x7\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "ldr d6, [x13, x3]\n"
- "smlal2 v22.4s, v10.8h, v31.8h\n"
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v31.8h\n"
- "ldr d31, [x6, #0x78]\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v9.4h, v8.4h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "ldr d28, [x12, x3]\n"
- "smlal v7.4s, v14.4h, v3.4h\n"
- "ssubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v5.4s, v29.8h, v8.8h\n"
- "ldr d29, [x6, #0x80]\n"
- "smlal2 v22.4s, v9.8h, v8.8h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "ldr d8, [x11, x3]\n"
- "smlal v20.4s, v14.4h, v21.4h\n"
- "ssubl v8.8h, v8.8b, v18.8b\n"
- "smlal v24.4s, v12.4h, v21.4h\n"
- "smlal v23.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v14.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v27.4h\n"
- "smlal2 v5.4s, v14.8h, v21.8h\n"
- "ldr d14, [x10, x3]\n"
- "smlal2 v22.4s, v12.8h, v21.8h\n"
- "ssubl v14.8h, v14.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v21.8h\n"
- "ldr d21, [x6, #0x88]\n"
- "smlal v20.4s, v25.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v16.4h, v3.4h\n"
- "smlal v23.4s, v4.4h, v3.4h\n"
- "smlal2 v15.4s, v25.8h, v27.8h\n"
- "smlal v7.4s, v10.4h, v1.4h\n"
- "smlal2 v5.4s, v25.8h, v3.8h\n"
- "ldr d25, [x9, x3]\n"
- "smlal2 v22.4s, v16.8h, v3.8h\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v3.8h\n"
- "ldr d3, [x6, #0x90]\n"
- "smlal v20.4s, v10.4h, v27.4h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal v24.4s, v4.4h, v27.4h\n"
- "smlal v23.4s, v17.4h, v27.4h\n"
- "smlal2 v15.4s, v10.8h, v1.8h\n"
- "smlal v7.4s, v9.4h, v2.4h\n"
- "smlal2 v5.4s, v10.8h, v27.8h\n"
- "ldr d10, [x28, x3]\n"
- "smlal2 v22.4s, v4.8h, v27.8h\n"
- "ssubl v10.8h, v10.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v27.8h\n"
- "ldr d27, [x6, #0x98]\n"
- "smlal v20.4s, v9.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v17.4h, v1.4h\n"
- "smlal v23.4s, v6.4h, v1.4h\n"
- "smlal2 v15.4s, v9.8h, v2.8h\n"
- "smlal v7.4s, v12.4h, v31.4h\n"
- "smlal2 v5.4s, v9.8h, v1.8h\n"
- "ldr d9, [x27, x3]\n"
- "smlal2 v22.4s, v17.8h, v1.8h\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v1.8h\n"
- "ldr d1, [x6, #0xa0]\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v2.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v15.4s, v12.8h, v31.8h\n"
- "ldr d12, [x26, x3]\n"
- "smlal v7.4s, v16.4h, v29.4h\n"
- "ssubl v12.8h, v12.8b, v18.8b\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d30, [x6, #0xa8]\n"
- "smlal2 v22.4s, v6.8h, v2.8h\n"
- "ssubl v30.8h, v30.8b, v13.8b\n"
- "smlal2 v19.4s, v28.8h, v2.8h\n"
- "ldr d2, [x25, x3]\n"
- "smlal v20.4s, v16.4h, v31.4h\n"
- "ssubl v2.8h, v2.8b, v18.8b\n"
- "smlal v24.4s, v8.4h, v31.4h\n"
- "smlal v23.4s, v14.4h, v31.4h\n"
- "smlal2 v15.4s, v16.8h, v29.8h\n"
- "smlal v7.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v16.8h, v31.8h\n"
- "ldr d16, [x24, x3]\n"
- "smlal2 v22.4s, v8.8h, v31.8h\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v31.8h\n"
- "ldr d31, [x6, #0xb0]\n"
- "smlal v20.4s, v4.4h, v29.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v29.4h\n"
- "smlal v23.4s, v25.4h, v29.4h\n"
- "smlal2 v15.4s, v4.8h, v21.8h\n"
- "smlal v7.4s, v17.4h, v3.4h\n"
- "smlal2 v5.4s, v4.8h, v29.8h\n"
- "ldr d4, [x23, x3]\n"
- "smlal2 v22.4s, v14.8h, v29.8h\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v29.8h\n"
- "ldr d29, [x6, #0xb8]\n"
- "smlal v20.4s, v17.4h, v21.4h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v15.4s, v17.8h, v3.8h\n"
- "smlal v7.4s, v6.4h, v27.4h\n"
- "smlal2 v5.4s, v17.8h, v21.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "ldr d21, [x6, #0xc0]\n"
- "smlal v20.4s, v6.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v3.4h\n"
- "smlal v23.4s, v9.4h, v3.4h\n"
- "smlal2 v15.4s, v6.8h, v27.8h\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "smlal2 v5.4s, v6.8h, v3.8h\n"
- "ldr d6, [x21, x3]\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v3.8h\n"
- "ldr d3, [x20, x3]\n"
- "smlal v20.4s, v28.4h, v27.4h\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "smlal v24.4s, v9.4h, v27.4h\n"
- "smlal v23.4s, v12.4h, v27.4h\n"
- "add x3, x3, #0x8\n"
- "smlal2 v15.4s, v8.8h, v1.8h\n"
- "ldr q8, [x7, #0x0]\n"
- "smlal v7.4s, v14.4h, v30.4h\n"
- "smlal2 v5.4s, v28.8h, v27.8h\n"
- "ldr q28, [x8, #0x0]\n"
- "smlal2 v22.4s, v9.8h, v27.8h\n"
- "smlal2 v19.4s, v12.8h, v27.8h\n"
- "ldr q27, [x7, #0x10]\n"
- "smlal v20.4s, v14.4h, v1.4h\n"
+ "smlal v8.4s, v12.4h, v2.4h\n"
+ "smlal2 v0.4s, v12.8h, v2.8h\n"
+ "ldr d12, [x14, x2]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v22.4h\n"
+ "smlal2 v30.4s, v11.8h, v22.8h\n"
+ "ldr d22, [x5, #0x70]\n"
+ "smlal v27.4s, v7.4h, v2.4h\n"
+ "smlal v1.4s, v11.4h, v2.4h\n"
+ "smlal2 v6.4s, v7.8h, v2.8h\n"
+ "smlal2 v25.4s, v11.8h, v2.8h\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v24.4h\n"
+ "smlal2 v0.4s, v7.8h, v24.8h\n"
+ "ldr d7, [x13, x2]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v2.4h\n"
+ "smlal2 v30.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x5, #0x78]\n"
+ "smlal v27.4s, v29.4h, v24.4h\n"
+ "smlal v1.4s, v28.4h, v24.4h\n"
+ "smlal2 v6.4s, v29.8h, v24.8h\n"
+ "ldr d29, [x12, x2]\n"
+ "smlal2 v25.4s, v28.8h, v24.8h\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v18.4h, v31.4h\n"
+ "smlal2 v0.4s, v18.8h, v31.8h\n"
+ "ldr d18, [x5, #0x80]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v24.4h\n"
+ "smlal2 v30.4s, v16.8h, v24.8h\n"
+ "ldr d24, [x11, x2]\n"
+ "smlal v27.4s, v17.4h, v31.4h\n"
+ "smlal v1.4s, v23.4h, v31.4h\n"
+ "ssubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v6.4s, v17.8h, v31.8h\n"
+ "smlal2 v25.4s, v23.8h, v31.8h\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v8.4s, v17.4h, v21.4h\n"
+ "smlal2 v0.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x10, x2]\n"
+ "ssubl v24.8h, v24.8b, v15.8b\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x88]\n"
+ "smlal v27.4s, v26.4h, v21.4h\n"
+ "smlal v1.4s, v20.4h, v21.4h\n"
+ "smlal2 v6.4s, v26.8h, v21.8h\n"
+ "smlal2 v25.4s, v20.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v0.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x9, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v21.4h\n"
+ "smlal2 v30.4s, v19.8h, v21.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "smlal v27.4s, v11.4h, v4.4h\n"
+ "smlal v1.4s, v19.4h, v4.4h\n"
+ "smlal2 v6.4s, v11.8h, v4.8h\n"
+ "smlal2 v25.4s, v19.8h, v4.8h\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v0.4s, v11.8h, v3.8h\n"
+ "ldr d11, [x28, x2]\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v4.4h\n"
+ "smlal2 v30.4s, v12.8h, v4.8h\n"
+ "ldr d4, [x5, #0x98]\n"
+ "smlal v27.4s, v28.4h, v3.4h\n"
+ "smlal v1.4s, v12.4h, v3.4h\n"
+ "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "smlal2 v25.4s, v12.8h, v3.8h\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal2 v0.4s, v28.8h, v22.8h\n"
+ "ldr d28, [x27, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v3.4h\n"
+ "smlal2 v30.4s, v7.8h, v3.8h\n"
+ "ldr d3, [x5, #0xa0]\n"
+ "smlal v27.4s, v16.4h, v22.4h\n"
+ "smlal v1.4s, v7.4h, v22.4h\n"
+ "smlal2 v6.4s, v16.8h, v22.8h\n"
+ "ldr d16, [x26, x2]\n"
+ "smlal2 v25.4s, v7.8h, v22.8h\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal2 v0.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x5, #0xa8]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v22.4h\n"
+ "smlal2 v30.4s, v29.8h, v22.8h\n"
+ "ldr d22, [x25, x2]\n"
+ "smlal v27.4s, v20.4h, v2.4h\n"
+ "smlal v1.4s, v24.4h, v2.4h\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "smlal2 v6.4s, v20.8h, v2.8h\n"
+ "smlal2 v25.4s, v24.8h, v2.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x24, x2]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v2.4h\n"
+ "smlal2 v30.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x5, #0xb0]\n"
+ "smlal v27.4s, v19.4h, v18.4h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v6.4s, v19.8h, v18.8h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "smlal v8.4s, v19.4h, v31.4h\n"
+ "smlal2 v0.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x23, x2]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v30.4s, v26.8h, v18.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "smlal v27.4s, v12.4h, v31.4h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v6.4s, v12.8h, v31.8h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v21.4h\n"
+ "smlal2 v0.4s, v12.8h, v21.8h\n"
+ "ldr d12, [x22, x2]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d31, [x5, #0xc0]\n"
+ "smlal v27.4s, v7.4h, v21.4h\n"
+ "smlal v1.4s, v11.4h, v21.4h\n"
+ "smlal2 v6.4s, v7.8h, v21.8h\n"
+ "smlal2 v25.4s, v11.8h, v21.8h\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v4.4h\n"
+ "smlal2 v0.4s, v7.8h, v4.8h\n"
+ "ldr d7, [x21, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v21.4h\n"
+ "smlal2 v30.4s, v28.8h, v21.8h\n"
+ "ldr d21, [x20, x2]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v27.4s, v29.4h, v4.4h\n"
+ "smlal v1.4s, v28.4h, v4.4h\n"
+ "smlal2 v6.4s, v29.8h, v4.8h\n"
+ "ldr q29, [x6, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v0.4s, v24.8h, v3.8h\n"
+ "ldr q24, [x7, #0x0]\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "smlal v5.4s, v16.4h, v4.4h\n"
+ "smlal2 v30.4s, v16.8h, v4.8h\n"
+ "ldr q4, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v27.4s, v17.4h, v3.4h\n"
+ "smlal v1.4s, v22.4h, v3.4h\n"
+ "smlal2 v6.4s, v17.8h, v3.8h\n"
+ "smlal2 v25.4s, v22.8h, v3.8h\n"
+ "ldr q22, [x7, #0x10]\n"
"add x7, x7, #0x20\n"
- "smlal v24.4s, v2.4h, v1.4h\n"
- "smlal v23.4s, v16.4h, v1.4h\n"
- "smlal2 v15.4s, v14.8h, v30.8h\n"
- "smlal v7.4s, v25.4h, v31.4h\n"
- "smlal2 v5.4s, v14.8h, v1.8h\n"
- "ldr q14, [x8, #0x10]\n"
- "smlal2 v22.4s, v2.8h, v1.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v19.4s, v16.8h, v1.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal v24.4s, v16.4h, v30.4h\n"
- "smlal v23.4s, v4.4h, v30.4h\n"
- "smlal2 v15.4s, v25.8h, v31.8h\n"
- "smlal v7.4s, v10.4h, v29.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal2 v22.4s, v16.8h, v30.8h\n"
- "smlal2 v19.4s, v4.8h, v30.8h\n"
- "smlal v20.4s, v10.4h, v31.4h\n"
- "smlal v24.4s, v4.4h, v31.4h\n"
- "smlal v23.4s, v17.4h, v31.4h\n"
- "smlal2 v15.4s, v10.8h, v29.8h\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "sqrdmulh v7.4s, v7.4s, v8.4s\n"
- "smlal2 v5.4s, v10.8h, v31.8h\n"
- "smlal2 v22.4s, v4.8h, v31.8h\n"
- "and v4.16b, v7.16b, v28.16b\n"
- "smlal2 v19.4s, v17.8h, v31.8h\n"
- "smlal v20.4s, v9.4h, v29.4h\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "smlal v24.4s, v17.4h, v29.4h\n"
- "smlal v23.4s, v6.4h, v29.4h\n"
- "sqadd v7.4s, v7.4s, v4.4s\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal2 v5.4s, v9.8h, v29.8h\n"
- "sqrdmulh v15.4s, v15.4s, v27.4s\n"
- "smlal2 v22.4s, v17.8h, v29.8h\n"
- "smlal2 v19.4s, v6.8h, v29.8h\n"
- "and v30.16b, v15.16b, v14.16b\n"
- "smlal v20.4s, v12.4h, v21.4h\n"
- "smlal v24.4s, v6.4h, v21.4h\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smlal v23.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v12.8h, v21.8h\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "smlal2 v22.4s, v6.8h, v21.8h\n"
- "smlal2 v19.4s, v3.8h, v21.8h\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v3.16b, v20.16b, v28.16b\n"
- "sqrdmulh v5.4s, v5.4s, v27.4s\n"
- "and v25.16b, v24.16b, v28.16b\n"
- "sqrdmulh v22.4s, v22.4s, v27.4s\n"
- "and v16.16b, v23.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v27.4s\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v4.16b, v5.16b, v14.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "and v10.16b, v22.16b, v14.16b\n"
+ "smlal v8.4s, v17.4h, v23.4h\n"
+ "smlal2 v0.4s, v17.8h, v23.8h\n"
+ "smlal v5.4s, v20.4h, v3.4h\n"
+ "smlal2 v30.4s, v20.8h, v3.8h\n"
+ "smlal v27.4s, v26.4h, v23.4h\n"
+ "smlal v1.4s, v20.4h, v23.4h\n"
+ "smlal2 v6.4s, v26.8h, v23.8h\n"
+ "smlal2 v25.4s, v20.8h, v23.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal2 v0.4s, v26.8h, v2.8h\n"
+ "smlal v5.4s, v19.4h, v23.4h\n"
+ "smlal2 v30.4s, v19.8h, v23.8h\n"
+ "smlal v27.4s, v11.4h, v2.4h\n"
+ "smlal v1.4s, v19.4h, v2.4h\n"
+ "smlal2 v6.4s, v11.8h, v2.8h\n"
+ "smlal2 v25.4s, v19.8h, v2.8h\n"
+ "smlal v8.4s, v11.4h, v18.4h\n"
+ "smlal2 v0.4s, v11.8h, v18.8h\n"
+ "smlal v5.4s, v12.4h, v2.4h\n"
+ "smlal2 v30.4s, v12.8h, v2.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal v1.4s, v12.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal2 v25.4s, v12.8h, v18.8h\n"
+ "smlal v8.4s, v28.4h, v31.4h\n"
+ "smlal2 v0.4s, v28.8h, v31.8h\n"
+ "smlal v5.4s, v7.4h, v18.4h\n"
+ "smlal2 v30.4s, v7.8h, v18.8h\n"
+ "smlal v27.4s, v16.4h, v31.4h\n"
+ "smlal v1.4s, v7.4h, v31.4h\n"
+ "smlal2 v6.4s, v16.8h, v31.8h\n"
+ "smlal2 v25.4s, v7.8h, v31.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v4.4s\n"
+ "smlal v5.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "and v17.16b, v8.16b, v24.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v28.16b, v0.16b, v22.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v16.16b, v27.16b, v24.16b\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v11.16b, v5.16b, v24.16b\n"
+ "sqadd v0.4s, v0.4s, v28.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v12.16b, v19.16b, v14.16b\n"
- "sqadd v20.4s, v20.4s, v3.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v25.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
+ "and v18.16b, v6.16b, v22.16b\n"
"sshr v12.4s, v12.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v28.4s\n"
- "srshl v20.4s, v20.4s, v28.4s\n"
- "sqadd v5.4s, v5.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v28.4s\n"
- "sqadd v19.4s, v19.4s, v12.4s\n"
- "srshl v15.4s, v15.4s, v14.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v14.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v14.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x17, x4]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d20, [x16, x4]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x15, x4]\n"
- "str d23, [x14, x4]\n"
- "add x4, x4, #0x8\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v19.16b, v30.16b, v22.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v5.4s, v5.4s, v11.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v24.4s\n"
+ "srshl v27.4s, v27.4s, v24.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v24.4s\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d8, [x8, x3]\n"
+ "str d27, [x17, x3]\n"
+ "str d1, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "add x3, x3, #0x8\n"
"beq 124f\n"
- "add x6, x6, #0xc8\n"
+ "add x5, x5, #0xc8\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v7.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v0.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v15.s }[2], [x20]\n"
+ "ld1 { v0.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v15.s }[0], [x20]\n"
+ "ld1 { v0.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v7.s }[2], [x20]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v7.s }[0], [x20]\n"
+ "ld1 { v8.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldr d12, [x6, #0x20]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ssubl v10.8h, v10.8b, v13.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "add x9, x9, x3\n"
- "add x28, x28, x3\n"
- "add x27, x27, x3\n"
- "add x26, x26, x3\n"
- "add x25, x25, x3\n"
- "add x24, x24, x3\n"
- "add x23, x23, x3\n"
- "add x22, x22, x3\n"
- "add x21, x21, x3\n"
- "add x20, x20, x3\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "ldr d23, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "add x9, x9, x2\n"
+ "add x28, x28, x2\n"
+ "add x27, x27, x2\n"
+ "add x26, x26, x2\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "add x25, x25, x2\n"
+ "add x24, x24, x2\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "add x23, x23, x2\n"
+ "add x22, x22, x2\n"
+ "add x21, x21, x2\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 9f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v17.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "ld1 { v16.s }[0], [x26], #0x4\n"
- "ld1 { v3.s }[0], [x25], #0x4\n"
- "ld1 { v4.s }[0], [x24], #0x4\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x9], #0x4\n"
+ "ld1 { v21.s }[0], [x28], #0x4\n"
+ "ld1 { v16.s }[0], [x27], #0x4\n"
+ "ld1 { v20.s }[0], [x26], #0x4\n"
+ "ld1 { v7.s }[0], [x25], #0x4\n"
+ "ld1 { v19.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
"ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v17.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v16.h }[2], [x26], #0x2\n"
- "ld1 { v3.h }[2], [x25], #0x2\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v21.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v20.h }[2], [x26], #0x2\n"
+ "ld1 { v7.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
"ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[6], [x9]\n"
- "ld1 { v17.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
- "ld1 { v16.b }[6], [x26]\n"
- "ld1 { v3.b }[6], [x25]\n"
- "ld1 { v4.b }[6], [x24]\n"
- "ld1 { v25.b }[6], [x23]\n"
- "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x9]\n"
+ "ld1 { v21.b }[6], [x28]\n"
+ "ld1 { v16.b }[6], [x27]\n"
+ "ld1 { v20.b }[6], [x26]\n"
+ "ld1 { v7.b }[6], [x25]\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
"ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[4], [x9]\n"
- "ld1 { v17.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v16.b }[4], [x26]\n"
- "ld1 { v3.b }[4], [x25]\n"
- "ld1 { v4.b }[4], [x24]\n"
- "ld1 { v25.b }[4], [x23]\n"
- "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x9]\n"
+ "ld1 { v21.b }[4], [x28]\n"
+ "ld1 { v16.b }[4], [x27]\n"
+ "ld1 { v20.b }[4], [x26]\n"
+ "ld1 { v7.b }[4], [x25]\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
"ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v17.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "ld1 { v16.h }[0], [x26], #0x2\n"
- "ld1 { v3.h }[0], [x25], #0x2\n"
- "ld1 { v4.h }[0], [x24], #0x2\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x9], #0x2\n"
+ "ld1 { v21.h }[0], [x28], #0x2\n"
+ "ld1 { v16.h }[0], [x27], #0x2\n"
+ "ld1 { v20.h }[0], [x26], #0x2\n"
+ "ld1 { v7.h }[0], [x25], #0x2\n"
+ "ld1 { v19.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
"ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[2], [x9]\n"
- "ld1 { v17.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
- "ld1 { v16.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x24]\n"
- "ld1 { v25.b }[2], [x23]\n"
- "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x9]\n"
+ "ld1 { v21.b }[2], [x28]\n"
+ "ld1 { v16.b }[2], [x27]\n"
+ "ld1 { v20.b }[2], [x26]\n"
+ "ld1 { v7.b }[2], [x25]\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
"ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[0], [x9]\n"
- "ld1 { v17.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
- "ld1 { v16.b }[0], [x26]\n"
- "ld1 { v3.b }[0], [x25]\n"
- "ld1 { v4.b }[0], [x24]\n"
- "ld1 { v25.b }[0], [x23]\n"
- "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x9]\n"
+ "ld1 { v21.b }[0], [x28]\n"
+ "ld1 { v16.b }[0], [x27]\n"
+ "ld1 { v20.b }[0], [x26]\n"
+ "ld1 { v7.b }[0], [x25]\n"
+ "ld1 { v19.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
"ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ssubl v31.8h, v31.8b, v18.8b\n"
- "ssubl v17.8h, v17.8b, v18.8b\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "ldr x20, [x5, #0x50]\n"
- "ssubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "add x20, x20, x3\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "ssubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "ssubl v29.8h, v29.8b, v18.8b\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "ssubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v24.8h, v24.8b, v15.8b\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0x50]\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "add x20, x20, x2\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "ssubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v4.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v4.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v4.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v4.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ssubl v27.8h, v27.8b, v18.8b\n"
- "ldr x20, [x5, #0x58]\n"
- "smlal v23.4s, v27.4h, v10.4h\n"
- "smlal2 v19.4s, v27.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "smlal v24.4s, v27.4h, v21.4h\n"
- "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal v5.4s, v4.4h, v17.4h\n"
+ "smlal2 v30.4s, v4.8h, v17.8h\n"
+ "smlal v1.4s, v4.4h, v11.4h\n"
+ "smlal2 v25.4s, v4.8h, v11.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 17f\n"
- "ld1 { v6.s }[0], [x20], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v6.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v6.h }[0], [x20], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ssubl v6.8h, v6.8b, v18.8b\n"
- "ldr x20, [x5, #0x60]\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "add x20, x20, x3\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0x60]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "smlal v5.4s, v21.4h, v11.4h\n"
+ "smlal2 v30.4s, v21.8h, v11.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 21f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d14, [x6, #0x28]\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal v20.4s, v9.4h, v12.4h\n"
- "smlal2 v5.4s, v9.8h, v12.8h\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v30.4h, v14.4h\n"
- "smlal2 v15.4s, v30.8h, v14.8h\n"
- "smlal v20.4s, v16.4h, v14.4h\n"
- "smlal2 v5.4s, v16.8h, v14.8h\n"
- "smlal v24.4s, v28.4h, v14.4h\n"
- "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "ldr d11, [x5, #0x28]\n"
+ "ssubl v31.8h, v31.8b, v15.8b\n"
+ "smlal v1.4s, v21.4h, v23.4h\n"
+ "smlal2 v25.4s, v21.8h, v23.8h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "smlal v27.4s, v31.4h, v23.4h\n"
+ "smlal2 v6.4s, v31.8h, v23.8h\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "smlal v1.4s, v18.4h, v11.4h\n"
+ "smlal2 v25.4s, v18.8h, v11.8h\n"
+ "smlal v27.4s, v20.4h, v11.4h\n"
+ "smlal2 v6.4s, v20.8h, v11.8h\n"
"tbz x1, #2, 25f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x1, #1, 26f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d21, [x6, #0x30]\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0x70]\n"
- "smlal v23.4s, v25.4h, v14.4h\n"
- "smlal2 v19.4s, v25.8h, v14.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v16.8h, v21.8h\n"
- "smlal v20.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v4.8h, v21.8h\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ldr d3, [x5, #0x30]\n"
+ "ssubl v24.8h, v24.8b, v15.8b\n"
+ "ldr x20, [x4, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v24.4h, v11.4h\n"
+ "smlal2 v30.4s, v24.8h, v11.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v3.4h\n"
+ "smlal2 v0.4s, v20.8h, v3.8h\n"
+ "smlal v27.4s, v19.4h, v3.4h\n"
+ "smlal2 v6.4s, v19.8h, v3.8h\n"
+ "smlal v1.4s, v24.4h, v3.4h\n"
+ "smlal2 v25.4s, v24.8h, v3.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d9, [x6, #0x38]\n"
- "ssubl v10.8h, v10.8b, v18.8b\n"
- "ssubl v9.8h, v9.8b, v13.8b\n"
- "ldr x20, [x5, #0x78]\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v4.4h, v9.4h\n"
- "smlal2 v15.4s, v4.8h, v9.8h\n"
- "smlal v20.4s, v27.4h, v9.4h\n"
- "smlal2 v5.4s, v27.8h, v9.8h\n"
- "smlal v24.4s, v10.4h, v9.4h\n"
- "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "ldr d22, [x5, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x20, [x4, #0x78]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v2.4h, v3.4h\n"
+ "smlal2 v30.4s, v2.8h, v3.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v19.4h, v22.4h\n"
+ "smlal2 v0.4s, v19.8h, v22.8h\n"
+ "smlal v27.4s, v4.4h, v22.4h\n"
+ "smlal2 v6.4s, v4.8h, v22.8h\n"
+ "smlal v1.4s, v2.4h, v22.4h\n"
+ "smlal2 v25.4s, v2.8h, v22.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v12.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v12.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v12.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d31, [x6, #0x40]\n"
- "ssubl v12.8h, v12.8b, v18.8b\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "ldr x20, [x5, #0x80]\n"
- "smlal v23.4s, v12.4h, v9.4h\n"
- "smlal2 v19.4s, v12.8h, v9.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v27.4h, v31.4h\n"
- "smlal2 v15.4s, v27.8h, v31.8h\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "smlal v24.4s, v12.4h, v31.4h\n"
- "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "ldr d31, [x5, #0x40]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr x20, [x4, #0x80]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v22.4h\n"
+ "smlal2 v30.4s, v26.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v4.4h, v31.4h\n"
+ "smlal2 v0.4s, v4.8h, v31.8h\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d16, [x6, #0x48]\n"
- "ssubl v8.8h, v8.8b, v18.8b\n"
- "ssubl v16.8h, v16.8b, v13.8b\n"
- "ldr x20, [x5, #0x88]\n"
- "smlal v23.4s, v8.4h, v31.4h\n"
- "smlal2 v19.4s, v8.8h, v31.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v6.4h, v16.4h\n"
- "smlal2 v15.4s, v6.8h, v16.8h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal2 v5.4s, v29.8h, v16.8h\n"
- "smlal v24.4s, v8.4h, v16.4h\n"
- "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "ldr d17, [x5, #0x48]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x20, [x4, #0x88]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v31.4h\n"
+ "smlal2 v30.4s, v28.8h, v31.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v21.4h, v17.4h\n"
+ "smlal2 v0.4s, v21.8h, v17.8h\n"
+ "smlal v27.4s, v29.4h, v17.4h\n"
+ "smlal2 v6.4s, v29.8h, v17.8h\n"
+ "smlal v1.4s, v28.4h, v17.4h\n"
+ "smlal2 v25.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v7.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v7.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v7.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v7.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v7.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v7.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d21, [x6, #0x50]\n"
- "ssubl v27.8h, v27.8b, v18.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0x90]\n"
- "smlal v23.4s, v27.4h, v16.4h\n"
- "smlal2 v19.4s, v27.8h, v16.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "smlal v20.4s, v25.4h, v21.4h\n"
- "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "ldr d22, [x5, #0x50]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr x20, [x4, #0x90]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v17.4h\n"
+ "smlal2 v30.4s, v7.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v18.4h, v22.4h\n"
+ "smlal2 v0.4s, v18.8h, v22.8h\n"
+ "smlal v27.4s, v24.4h, v22.4h\n"
+ "smlal2 v6.4s, v24.8h, v22.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ssubl v31.8h, v31.8b, v18.8b\n"
- "ldr x20, [x5, #0x98]\n"
- "smlal v24.4s, v31.4h, v21.4h\n"
- "smlal2 v22.4s, v31.8h, v21.8h\n"
- "add x20, x20, x3\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x20, [x4, #0x98]\n"
+ "smlal v1.4s, v20.4h, v22.4h\n"
+ "smlal2 v25.4s, v20.8h, v22.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 49f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d2, [x6, #0x58]\n"
- "ssubl v28.8h, v28.8b, v18.8b\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr x20, [x5, #0xa0]\n"
- "smlal v23.4s, v28.4h, v21.4h\n"
- "smlal2 v19.4s, v28.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v15.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v10.4h, v2.4h\n"
- "smlal2 v5.4s, v10.8h, v2.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ldr d17, [x5, #0x58]\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v22.4h\n"
+ "smlal2 v30.4s, v19.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v24.4h, v17.4h\n"
+ "smlal2 v0.4s, v24.8h, v17.8h\n"
+ "smlal v27.4s, v2.4h, v17.4h\n"
+ "smlal2 v6.4s, v2.8h, v17.8h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d25, [x6, #0x60]\n"
- "ssubl v21.8h, v21.8b, v18.8b\n"
- "ssubl v25.8h, v25.8b, v13.8b\n"
- "ldr x20, [x5, #0xa8]\n"
- "smlal v23.4s, v21.4h, v2.4h\n"
- "smlal2 v19.4s, v21.8h, v2.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v10.4h, v25.4h\n"
- "smlal2 v15.4s, v10.8h, v25.8h\n"
- "smlal v20.4s, v12.4h, v25.4h\n"
- "smlal2 v5.4s, v12.8h, v25.8h\n"
- "smlal v24.4s, v21.4h, v25.4h\n"
- "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "ldr d24, [x5, #0x60]\n"
+ "ssubl v29.8h, v29.8b, v15.8b\n"
+ "ldr x20, [x4, #0xa8]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v17.4h\n"
+ "smlal2 v30.4s, v29.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v2.4h, v24.4h\n"
+ "smlal2 v0.4s, v2.8h, v24.8h\n"
+ "smlal v27.4s, v26.4h, v24.4h\n"
+ "smlal2 v6.4s, v26.8h, v24.8h\n"
+ "smlal v1.4s, v29.4h, v24.4h\n"
+ "smlal2 v25.4s, v29.8h, v24.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d1, [x6, #0x68]\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr x20, [x5, #0xb0]\n"
- "smlal v23.4s, v9.4h, v25.4h\n"
- "smlal2 v19.4s, v9.8h, v25.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v12.4h, v1.4h\n"
- "smlal2 v15.4s, v12.8h, v1.8h\n"
- "smlal v20.4s, v8.4h, v1.4h\n"
- "smlal2 v5.4s, v8.8h, v1.8h\n"
- "smlal v24.4s, v9.4h, v1.4h\n"
- "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "ldr d17, [x5, #0x68]\n"
+ "ssubl v31.8h, v31.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v31.4h, v24.4h\n"
+ "smlal2 v30.4s, v31.8h, v24.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v26.4h, v17.4h\n"
+ "smlal2 v0.4s, v26.8h, v17.8h\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "smlal v1.4s, v31.4h, v17.4h\n"
+ "smlal2 v25.4s, v31.8h, v17.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v3.s }[0], [x20], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v3.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v3.h }[0], [x20], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d16, [x6, #0x70]\n"
- "ssubl v3.8h, v3.8b, v18.8b\n"
- "ssubl v16.8h, v16.8b, v13.8b\n"
- "ldr x20, [x5, #0xb8]\n"
- "smlal v23.4s, v3.4h, v1.4h\n"
- "smlal2 v19.4s, v3.8h, v1.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "smlal2 v15.4s, v8.8h, v16.8h\n"
- "smlal v20.4s, v27.4h, v16.4h\n"
- "smlal2 v5.4s, v27.8h, v16.8h\n"
- "smlal v24.4s, v3.4h, v16.4h\n"
- "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "ldr d22, [x5, #0x70]\n"
+ "ssubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v21.4h, v17.4h\n"
+ "smlal2 v30.4s, v21.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal2 v0.4s, v28.8h, v22.8h\n"
+ "smlal v27.4s, v7.4h, v22.4h\n"
+ "smlal2 v6.4s, v7.8h, v22.8h\n"
+ "smlal v1.4s, v21.4h, v22.4h\n"
+ "smlal2 v25.4s, v21.8h, v22.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v14.s }[0], [x20], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v14.h }[2], [x20], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[6], [x20]\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[4], [x20]\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v14.h }[0], [x20], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[2], [x20]\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[0], [x20]\n"
+ "ld1 { v11.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d17, [x6, #0x78]\n"
- "ssubl v14.8h, v14.8b, v18.8b\n"
- "ssubl v17.8h, v17.8b, v13.8b\n"
- "ldr x20, [x5, #0xc0]\n"
- "smlal v23.4s, v14.4h, v16.4h\n"
- "smlal2 v19.4s, v14.8h, v16.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v31.4h, v17.4h\n"
- "smlal2 v15.4s, v31.8h, v17.8h\n"
- "smlal v20.4s, v28.4h, v17.4h\n"
- "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "ldr d17, [x5, #0x78]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr x20, [x4, #0xc0]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v22.4h\n"
+ "smlal2 v30.4s, v11.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v17.4h\n"
+ "smlal2 v0.4s, v20.8h, v17.8h\n"
+ "smlal v27.4s, v19.4h, v17.4h\n"
+ "smlal2 v6.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v1.s }[0], [x20], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v1.h }[2], [x20], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[6], [x20]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[4], [x20]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v1.h }[0], [x20], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ssubl v1.8h, v1.8b, v18.8b\n"
- "ldr x20, [x5, #0xc8]\n"
- "smlal v24.4s, v1.4h, v17.4h\n"
- "smlal2 v22.4s, v1.8h, v17.8h\n"
- "add x20, x20, x3\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr x20, [x4, #0xc8]\n"
+ "smlal v1.4s, v18.4h, v17.4h\n"
+ "smlal2 v25.4s, v18.8h, v17.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 73f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d29, [x6, #0x80]\n"
- "ssubl v16.8h, v16.8b, v18.8b\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "ldr x20, [x5, #0xd0]\n"
- "smlal v23.4s, v16.4h, v17.4h\n"
- "smlal2 v19.4s, v16.8h, v17.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v28.4h, v29.4h\n"
- "smlal2 v15.4s, v28.8h, v29.8h\n"
- "smlal v20.4s, v21.4h, v29.4h\n"
- "smlal2 v5.4s, v21.8h, v29.8h\n"
- "smlal v24.4s, v16.4h, v29.4h\n"
- "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "ldr d4, [x5, #0x80]\n"
+ "ssubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x20, [x4, #0xd0]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v20.4h, v17.4h\n"
+ "smlal2 v30.4s, v20.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v19.4h, v4.4h\n"
+ "smlal2 v0.4s, v19.8h, v4.8h\n"
+ "smlal v27.4s, v29.4h, v4.4h\n"
+ "smlal2 v6.4s, v29.8h, v4.8h\n"
+ "smlal v1.4s, v20.4h, v4.4h\n"
+ "smlal2 v25.4s, v20.8h, v4.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d12, [x6, #0x88]\n"
- "ssubl v30.8h, v30.8b, v18.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ldr x20, [x5, #0xd8]\n"
- "smlal v23.4s, v30.4h, v29.4h\n"
- "smlal2 v19.4s, v30.8h, v29.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v21.4h, v12.4h\n"
- "smlal2 v15.4s, v21.8h, v12.8h\n"
- "smlal v20.4s, v9.4h, v12.4h\n"
- "smlal2 v5.4s, v9.8h, v12.8h\n"
- "smlal v24.4s, v30.4h, v12.4h\n"
- "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "ldr d17, [x5, #0x88]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr x20, [x4, #0xd8]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v4.4h\n"
+ "smlal2 v30.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v29.4h, v17.4h\n"
+ "smlal2 v0.4s, v29.8h, v17.8h\n"
+ "smlal v27.4s, v31.4h, v17.4h\n"
+ "smlal2 v6.4s, v31.8h, v17.8h\n"
+ "smlal v1.4s, v26.4h, v17.4h\n"
+ "smlal2 v25.4s, v26.8h, v17.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v23.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d21, [x6, #0x90]\n"
- "ssubl v29.8h, v29.8b, v18.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0xe0]\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal v20.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v3.8h, v21.8h\n"
- "smlal v24.4s, v29.4h, v21.4h\n"
- "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "ldr d22, [x5, #0x90]\n"
+ "ssubl v23.8h, v23.8b, v15.8b\n"
+ "ldr x20, [x4, #0xe0]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v31.4h, v22.4h\n"
+ "smlal2 v0.4s, v31.8h, v22.8h\n"
+ "smlal v27.4s, v21.4h, v22.4h\n"
+ "smlal2 v6.4s, v21.8h, v22.8h\n"
+ "smlal v1.4s, v23.4h, v22.4h\n"
+ "smlal2 v25.4s, v23.8h, v22.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d8, [x6, #0x98]\n"
- "ssubl v25.8h, v25.8b, v18.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x20, [x5, #0xe8]\n"
- "smlal v23.4s, v25.4h, v21.4h\n"
- "smlal2 v19.4s, v25.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v3.4h, v8.4h\n"
- "smlal2 v15.4s, v3.8h, v8.8h\n"
- "smlal v20.4s, v14.4h, v8.4h\n"
- "smlal2 v5.4s, v14.8h, v8.8h\n"
- "smlal v24.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "ldr d17, [x5, #0x98]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x20, [x4, #0xe8]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v22.4h\n"
+ "smlal2 v30.4s, v28.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v21.4h, v17.4h\n"
+ "smlal2 v0.4s, v21.8h, v17.8h\n"
+ "smlal v27.4s, v11.4h, v17.4h\n"
+ "smlal2 v6.4s, v11.8h, v17.8h\n"
+ "smlal v1.4s, v28.4h, v17.4h\n"
+ "smlal2 v25.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d9, [x6, #0xa0]\n"
- "ssubl v21.8h, v21.8b, v18.8b\n"
- "ssubl v9.8h, v9.8b, v13.8b\n"
- "ldr x20, [x5, #0xf0]\n"
- "smlal v23.4s, v21.4h, v8.4h\n"
- "smlal2 v19.4s, v21.8h, v8.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v1.4h, v9.4h\n"
- "smlal2 v15.4s, v1.8h, v9.8h\n"
- "smlal v20.4s, v16.4h, v9.4h\n"
- "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "ldr d3, [x5, #0xa0]\n"
+ "ssubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x20, [x4, #0xf0]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v17.4h\n"
+ "smlal2 v30.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v18.4h, v3.4h\n"
+ "smlal2 v0.4s, v18.8h, v3.8h\n"
+ "smlal v27.4s, v20.4h, v3.4h\n"
+ "smlal2 v6.4s, v20.8h, v3.8h\n"
"tbz x1, #2, 93f\n"
"ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
@@ -1871,308 +1871,308 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 95f\n"
"ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ssubl v12.8h, v12.8b, v18.8b\n"
- "ldr x20, [x5, #0xf8]\n"
- "smlal v24.4s, v12.4h, v9.4h\n"
- "smlal2 v22.4s, v12.8h, v9.8h\n"
- "add x20, x20, x3\n"
+ "ssubl v12.8h, v12.8b, v15.8b\n"
+ "ldr x20, [x4, #0xf8]\n"
+ "smlal v1.4s, v12.4h, v3.4h\n"
+ "smlal2 v25.4s, v12.8h, v3.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 97f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d12, [x6, #0xa8]\n"
- "ssubl v10.8h, v10.8b, v18.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ldr x20, [x5, #0x100]\n"
- "smlal v23.4s, v10.4h, v9.4h\n"
- "smlal2 v19.4s, v10.8h, v9.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v16.4h, v12.4h\n"
- "smlal2 v15.4s, v16.8h, v12.8h\n"
- "smlal v20.4s, v30.4h, v12.4h\n"
- "smlal2 v5.4s, v30.8h, v12.8h\n"
- "smlal v24.4s, v10.4h, v12.4h\n"
- "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "ldr d18, [x5, #0xa8]\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "ldr x20, [x4, #0x100]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "smlal2 v30.4s, v17.8h, v3.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "smlal v27.4s, v26.4h, v18.4h\n"
+ "smlal2 v6.4s, v26.8h, v18.8h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d28, [x6, #0xb0]\n"
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "ssubl v28.8h, v28.8b, v13.8b\n"
- "ldr x20, [x5, #0x108]\n"
- "smlal v23.4s, v9.4h, v12.4h\n"
- "smlal2 v19.4s, v9.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v30.4h, v28.4h\n"
- "smlal2 v15.4s, v30.8h, v28.8h\n"
- "smlal v20.4s, v29.4h, v28.4h\n"
- "smlal2 v5.4s, v29.8h, v28.8h\n"
- "smlal v24.4s, v9.4h, v28.4h\n"
- "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "ldr d12, [x5, #0xb0]\n"
+ "ssubl v19.8h, v19.8b, v15.8b\n"
+ "ldr x20, [x4, #0x108]\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v18.4h\n"
+ "smlal2 v30.4s, v19.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v26.4h, v12.4h\n"
+ "smlal2 v0.4s, v26.8h, v12.8h\n"
+ "smlal v27.4s, v23.4h, v12.4h\n"
+ "smlal2 v6.4s, v23.8h, v12.8h\n"
+ "smlal v1.4s, v19.4h, v12.4h\n"
+ "smlal2 v25.4s, v19.8h, v12.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v2.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v2.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v2.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d30, [x6, #0xb8]\n"
- "ssubl v2.8h, v2.8b, v18.8b\n"
- "ssubl v30.8h, v30.8b, v13.8b\n"
- "ldr x20, [x5, #0x110]\n"
- "smlal v23.4s, v2.4h, v28.4h\n"
- "smlal2 v19.4s, v2.8h, v28.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v29.4h, v30.4h\n"
- "smlal2 v15.4s, v29.8h, v30.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal v24.4s, v2.4h, v30.4h\n"
- "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "ldr x20, [x4, #0x110]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v17.4h, v12.4h\n"
+ "smlal2 v30.4s, v17.8h, v12.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v23.4h, v18.4h\n"
+ "smlal2 v0.4s, v23.8h, v18.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d8, [x6, #0xc0]\n"
- "ssubl v27.8h, v27.8b, v18.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x20, [x5, #0x118]\n"
- "smlal v23.4s, v27.4h, v30.4h\n"
- "smlal2 v19.4s, v27.8h, v30.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v25.4h, v8.4h\n"
- "smlal2 v15.4s, v25.8h, v8.8h\n"
- "smlal v20.4s, v21.4h, v8.4h\n"
- "smlal2 v5.4s, v21.8h, v8.8h\n"
- "smlal v24.4s, v27.4h, v8.4h\n"
- "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d26, [x5, #0xc0]\n"
+ "ssubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v5.4s, v3.4h, v18.4h\n"
+ "smlal2 v30.4s, v3.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v28.4h, v26.4h\n"
+ "smlal2 v0.4s, v28.8h, v26.8h\n"
+ "smlal v27.4s, v16.4h, v26.4h\n"
+ "smlal2 v6.4s, v16.8h, v26.8h\n"
+ "smlal v1.4s, v3.4h, v26.4h\n"
+ "smlal2 v25.4s, v3.8h, v26.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ssubl v9.8h, v9.8b, v18.8b\n"
- "smlal v23.4s, v9.4h, v8.4h\n"
- "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "ssubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v26.4h\n"
+ "smlal2 v30.4s, v17.8h, v26.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v30.4s }, [x7], #0x10\n"
- "ld1 { v12.4s }, [x8], #0x10\n"
+ "ld1 { v9.4s }, [x6], #0x10\n"
+ "ld1 { v20.4s }, [x7], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v14.d }[0], [x7], #0x8\n"
- "ld1 { v27.d }[0], [x8], #0x8\n"
+ "ld1 { v18.d }[0], [x6], #0x8\n"
+ "ld1 { v3.d }[0], [x7], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v14.s }[2], [x7]\n"
- "ld1 { v27.s }[2], [x8]\n"
+ "ld1 { v18.s }[2], [x6]\n"
+ "ld1 { v3.s }[2], [x7]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v14.s }[0], [x7]\n"
- "ld1 { v27.s }[0], [x8]\n"
+ "ld1 { v18.s }[0], [x6]\n"
+ "ld1 { v3.s }[0], [x7]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v30.d }[0], [x7], #0x8\n"
- "ld1 { v12.d }[0], [x8], #0x8\n"
+ "ld1 { v9.d }[0], [x6], #0x8\n"
+ "ld1 { v20.d }[0], [x7], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v30.s }[2], [x7]\n"
- "ld1 { v12.s }[2], [x8]\n"
+ "ld1 { v9.s }[2], [x6]\n"
+ "ld1 { v20.s }[2], [x7]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v30.s }[0], [x7]\n"
- "ld1 { v12.s }[0], [x8]\n"
+ "ld1 { v9.s }[0], [x6]\n"
+ "ld1 { v20.s }[0], [x7]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v7.4s, v7.4s, v30.4s\n"
- "and v16.16b, v7.16b, v12.16b\n"
- "add x17, x17, x4\n"
- "add x16, x16, x4\n"
- "sqrdmulh v15.4s, v15.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add x15, x15, x4\n"
- "add x14, x14, x4\n"
- "and v2.16b, v15.16b, v27.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "sqrdmulh v24.4s, v24.4s, v30.4s\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "sqadd v7.4s, v7.4s, v16.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v21.16b, v20.16b, v12.16b\n"
- "sqrdmulh v5.4s, v5.4s, v14.4s\n"
- "and v18.16b, v24.16b, v12.16b\n"
- "sqrdmulh v22.4s, v22.4s, v14.4s\n"
- "and v31.16b, v23.16b, v12.16b\n"
- "sqrdmulh v19.4s, v19.4s, v14.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v9.16b, v5.16b, v27.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v22.16b, v27.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v28.16b, v19.16b, v27.16b\n"
- "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v9.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "add x8, x8, x3\n"
+ "add x17, x17, x3\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v9.4s\n"
+ "add x16, x16, x3\n"
+ "add x15, x15, x3\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "and v17.16b, v8.16b, v20.16b\n"
+ "and v23.16b, v0.16b, v3.16b\n"
+ "and v9.16b, v27.16b, v20.16b\n"
+ "and v26.16b, v1.16b, v20.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v18.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v31.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v12.4s\n"
- "srshl v20.4s, v20.4s, v12.4s\n"
- "sqadd v5.4s, v5.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v12.4s\n"
- "sqadd v22.4s, v22.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "srshl v15.4s, v15.4s, v27.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v27.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v27.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v27.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "and v24.16b, v6.16b, v3.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v18.16b, v25.16b, v3.16b\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v17.16b, v5.16b, v20.16b\n"
+ "sqadd v0.4s, v0.4s, v23.4s\n"
+ "and v16.16b, v30.16b, v3.16b\n"
+ "sqadd v27.4s, v27.4s, v9.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v26.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v24.4s\n"
+ "srshl v1.4s, v1.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "srshl v5.4s, v5.4s, v20.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v3.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v7.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x16], #0x4\n"
- "st1 { v24.s }[0], [x15], #0x4\n"
- "st1 { v23.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x8], #0x4\n"
+ "st1 { v27.s }[0], [x17], #0x4\n"
+ "st1 { v1.s }[0], [x16], #0x4\n"
+ "st1 { v5.s }[0], [x15], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v7.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x16], #0x2\n"
- "st1 { v24.h }[2], [x15], #0x2\n"
- "st1 { v23.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x8], #0x2\n"
+ "st1 { v27.h }[2], [x17], #0x2\n"
+ "st1 { v1.h }[2], [x16], #0x2\n"
+ "st1 { v5.h }[2], [x15], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x16], #0x1\n"
- "st1 { v24.b }[6], [x15], #0x1\n"
- "st1 { v23.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x8], #0x1\n"
+ "st1 { v27.b }[6], [x17], #0x1\n"
+ "st1 { v1.b }[6], [x16], #0x1\n"
+ "st1 { v5.b }[6], [x15], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x16], #0x1\n"
- "st1 { v24.b }[4], [x15], #0x1\n"
- "st1 { v23.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x8], #0x1\n"
+ "st1 { v27.b }[4], [x17], #0x1\n"
+ "st1 { v1.b }[4], [x16], #0x1\n"
+ "st1 { v5.b }[4], [x15], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v7.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x16], #0x2\n"
- "st1 { v24.h }[0], [x15], #0x2\n"
- "st1 { v23.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x8], #0x2\n"
+ "st1 { v27.h }[0], [x17], #0x2\n"
+ "st1 { v1.h }[0], [x16], #0x2\n"
+ "st1 { v5.h }[0], [x15], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x16], #0x1\n"
- "st1 { v24.b }[2], [x15], #0x1\n"
- "st1 { v23.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x8], #0x1\n"
+ "st1 { v27.b }[2], [x17], #0x1\n"
+ "st1 { v1.b }[2], [x16], #0x1\n"
+ "st1 { v5.b }[2], [x15], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x16], #0x1\n"
- "st1 { v24.b }[0], [x15], #0x1\n"
- "st1 { v23.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x8], #0x1\n"
+ "st1 { v27.b }[0], [x17], #0x1\n"
+ "st1 { v1.b }[0], [x16], #0x1\n"
+ "st1 { v5.b }[0], [x15], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
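For reference, the requantisation epilogue in the kernel above (the SQRDMULH, SRSHL, SQADD, SMAX/SMIN and UZP1 sequence) maps each 32-bit accumulator back to a quantised 8-bit output using the per-layer multiplier, right shift, output offset and clamp range. The scalar sketch below models that arithmetic; the function and parameter names are illustrative only and are not taken from the library.

#include <algorithm>
#include <cstdint>

// Illustrative scalar model of the per-layer requantisation performed by the
// NEON epilogue: saturating rounding doubling multiply-high (SQRDMULH),
// rounding shift right (SRSHL with a negative shift, plus the AND/SSHR/SQADD
// fix-up for negative values), output offset, then clamping.
inline int8_t requantize_s8(int32_t acc, int32_t multiplier, int right_shift,
                            int32_t c_offset, int32_t minval, int32_t maxval)
{
    // SQRDMULH: high half of 2*acc*multiplier, with rounding and saturation.
    int64_t prod = static_cast<int64_t>(acc) * multiplier;
    int64_t high = (prod + (int64_t{1} << 30)) >> 31;
    high = std::min<int64_t>(std::max<int64_t>(high, INT32_MIN), INT32_MAX);

    // Rounding arithmetic shift right; the fix-up subtracts one from negative
    // values first, mirroring the AND / SSHR #31 / SQADD instructions.
    int32_t v = static_cast<int32_t>(high);
    if (right_shift > 0)
    {
        if (v < 0) v -= 1;
        v = static_cast<int32_t>((static_cast<int64_t>(v) +
                                  (int64_t{1} << (right_shift - 1))) >> right_shift);
    }

    // Add the output zero point and clamp to the quantised output range
    // (SQADD with c_offset, then SMAX with minval and SMIN with maxval).
    v += c_offset;
    return static_cast<int8_t>(std::min(std::max(v, minval), maxval));
}

This is a sketch of the per-lane arithmetic only; in the assembly the offset add and clamp are applied after narrowing to 16 bits, which does not change the result for in-range values.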
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 77b7d231e0..dbdcedccf3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,21 +45,21 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v7.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v7.4s }, [x21]\n"
"ld1r { v6.16b }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v5.16b }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v5.16b }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v3.4s }, [x21]\n"
"ld1r { v2.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
+ "ld1r { v1.4s }, [x20]\n"
"cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
@@ -68,75 +68,75 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x25, %x[inptrs]\n"
- "ldp x21, x20, [x25], #0x10\n"
- "subs x24, %x[n_points], #0x1\n"
- "ldr s14, [x21, x11]\n"
- "ldr s15, [x20, x11]\n"
+ "mov x23, %x[inptrs]\n"
+ "subs x22, %x[n_points], #0x1\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x20, x11]\n"
- "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
+ "ldp x21, x20, [x23], #0x10\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x21, x11]\n"
- "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x20, x11]\n"
- "ldr x20, [x25], #0x8\n"
"ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x20, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ssubl v17.8h, v17.8b, v6.8b\n"
"ssubl v18.8h, v18.8b, v6.8b\n"
"ssubl v19.8h, v19.8b, v6.8b\n"
+ "ldr s20, [x21, x11]\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x23], #0x8\n"
"ssubl v20.8h, v20.8b, v6.8b\n"
"ssubl v21.8h, v21.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
"ssubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x23, x22, [x25], #0x10\n"
- "ldp x21, x20, [x25], #0x10\n"
+ "ldp x21, x20, [x23], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x23, x11]\n"
- "ldr s15, [x22, x11]\n"
+ "subs x22, x22, #0x1\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x21, x11]\n"
- "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s18, [x21, x11]\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x20, x11]\n"
- "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x21, x11]\n"
- "ssubl v0.8h, v0.8b, v5.8b\n"
"ssubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x20, x11]\n"
- "ldr x20, [x25], #0x8\n"
"ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"ssubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x20, x11]\n"
"ssubl v17.8h, v17.8b, v6.8b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"ssubl v18.8h, v18.8b, v6.8b\n"
"ssubl v19.8h, v19.8b, v6.8b\n"
+ "ldr s20, [x21, x11]\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x23], #0x8\n"
"ssubl v20.8h, v20.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
"ssubl v21.8h, v21.8b, v6.8b\n"
"ssubl v22.8h, v22.8b, v6.8b\n"
"bgt 3b\n"
@@ -162,27 +162,27 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
"sshl v25.4s, v25.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "and v18.16b, v23.16b, v1.16b\n"
- "and v17.16b, v24.16b, v1.16b\n"
- "and v16.16b, v25.16b, v1.16b\n"
- "sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
@@ -254,17 +254,17 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s23, [x28, x11]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s24, [x27, x11]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s25, [x26, x11]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s26, [x25, x11]\n"
+ "str s23, [x28, x11]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x11]\n"
+ "str s25, [x26, x11]\n"
+ "str s26, [x25, x11]\n"
"str s27, [x24, x11]\n"
"str s28, [x23, x11]\n"
"str s29, [x22, x11]\n"
@@ -290,24 +290,24 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x27, x26, [x10], #0x10\n"
- "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x23, x22, [x10], #0x10\n"
- "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v30.16b, v23.16b\n"
- "add x9, x9, x11\n"
- "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "ldp x25, x24, [x10], #0x10\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
@@ -358,27 +358,27 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ble 15f\n"
"12:" // Oddments: Planar loop
"ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x21, [x10], #0x8\n"
- "add x9, x9, x11\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
+ "add x9, x9, x11\n"
"add x28, x28, x11\n"
- "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
- "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
@@ -465,36 +465,36 @@ void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
"sshl v25.4s, v25.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
"add x28, x28, x11\n"
- "and v18.16b, v23.16b, v1.16b\n"
- "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
"add x26, x26, x11\n"
- "and v16.16b, v25.16b, v1.16b\n"
- "sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
"add x24, x24, x11\n"
- "sshl v27.4s, v27.4s, v3.4s\n"
- "sshl v28.4s, v28.4s, v3.4s\n"
"add x23, x23, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x22, x22, x11\n"
- "sshl v29.4s, v29.4s, v3.4s\n"
- "sshl v30.4s, v30.4s, v3.4s\n"
"add x21, x21, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
"add x20, x20, x11\n"
- "sshl v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v2.4s\n"
- "sqrdmulh v27.4s, v27.4s, v2.4s\n"
- "sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index be8fbfa0e2..ff03a6e340 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,162 +41,162 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q11, [%x[params], #0x0]\n"
+ "ldr q14, [%x[params], #0x0]\n"
"ldr q5, [%x[params], #0x10]\n"
- "movi v8.16b, #0x1\n"
- "ushr v8.4s, v8.4s, #0x8\n"
+ "movi v18.16b, #0x1\n"
+ "movi v24.4s, #0x0\n"
"ldr q6, [%x[params], #0x20]\n"
"ldr q7, [%x[params], #0x30]\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ld1 { v1.16b }, [x20]\n"
- "mov v28.16b, v1.16b\n"
- "mov v23.16b, v1.16b\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "ld1 { v2.16b }, [x20]\n"
- "mov v30.16b, v1.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "ld1 { v4.16b }, [x20]\n"
- "mov v20.16b, v2.16b\n"
- "mov v29.16b, v2.16b\n"
- "ldr x20, [%x[inptrs], #0x0]\n"
- "ld1 { v0.16b }, [x20]\n"
- "mov v9.16b, v4.16b\n"
- "mov v22.16b, v4.16b\n"
- "ldr x20, [%x[inptrs], #0x18]\n"
- "ld1 { v3.16b }, [x20]\n"
- "mov v31.16b, v4.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x2\n"
- "ext v23.16b, v23.16b, v23.16b, #0x4\n"
- "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "movi v28.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "ldr x25, [%x[inptrs], #0x8]\n"
+ "ldr x24, [%x[inptrs], #0x10]\n"
+ "ushr v18.4s, v18.4s, #0x8\n"
+ "movi v27.4s, #0x0\n"
+ "ldr x23, [%x[inptrs], #0x20]\n"
+ "ldr x22, [%x[inptrs], #0x0]\n"
+ "movi v21.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.4s }, [x20]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x2\n"
- "ext v20.16b, v20.16b, v20.16b, #0x4\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "ext v29.16b, v29.16b, v29.16b, #0x6\n"
- "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "ld1 { v1.16b }, [x25]\n"
+ "ld1 { v2.16b }, [x24]\n"
+ "movi v23.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ld1 { v4.16b }, [x23]\n"
+ "ld1 { v0.16b }, [x22]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "ld1 { v3.16b }, [x21]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "mov v31.16b, v1.16b\n"
+ "mov v9.16b, v1.16b\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_c_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x4\n"
- "ext v31.16b, v31.16b, v31.16b, #0x6\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v15.4s }, [x20]\n"
- "mov v27.16b, v0.16b\n"
- "mov v19.16b, v0.16b\n"
+ "ld1r { v11.4s }, [x21]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "mov v16.16b, v1.16b\n"
+ "mov v30.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x2\n"
+ "add x10, %x[qp], %[offsetof_Requantize32_maxval]\n"
"cmp %x[n_channels], #0x4\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
"mov x9, #0x0\n"
- "mov v18.16b, v0.16b\n"
- "mov v26.16b, v3.16b\n"
"mov x28, #0x0\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x2\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x4\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "mov v17.16b, v3.16b\n"
- "mov v16.16b, v3.16b\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "neg v19.4s, v19.4s\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- "ext v27.16b, v27.16b, v27.16b, #0x2\n"
- "ext v19.16b, v19.16b, v19.16b, #0x4\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "ext v18.16b, v18.16b, v18.16b, #0x6\n"
- "zip1 v1.4s, v1.4s, v23.4s\n"
- "zip1 v28.4s, v28.4s, v30.4s\n"
- "zip1 v2.4s, v2.4s, v20.4s\n"
- "zip1 v21.4s, v21.4s, v29.4s\n"
- "ext v26.16b, v26.16b, v26.16b, #0x2\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "zip1 v1.4s, v1.4s, v9.4s\n"
+ "ld1r { v9.4s }, [x10]\n"
+ "zip1 v31.4s, v31.4s, v16.4s\n"
+ "mov v16.16b, v2.16b\n"
+ "zip1 v2.4s, v2.4s, v29.4s\n"
+ "mov v29.16b, v4.16b\n"
"ext v16.16b, v16.16b, v16.16b, #0x6\n"
- "zip1 v4.4s, v4.4s, v22.4s\n"
- "zip1 v9.4s, v9.4s, v31.4s\n"
- "zip1 v0.4s, v0.4s, v19.4s\n"
- "zip1 v27.4s, v27.4s, v18.4s\n"
- "zip1 v1.4s, v1.4s, v28.4s\n"
- "zip1 v2.4s, v2.4s, v21.4s\n"
- ".inst 0x4f81e118 // sdot v24.4s, v8.16b, v1.4b[0]\n"
- "zip1 v3.4s, v3.4s, v17.4s\n"
- "zip1 v26.4s, v26.4s, v16.4s\n"
- ".inst 0x4fa1e119 // sdot v25.4s, v8.16b, v1.4b[1]\n"
- "zip1 v4.4s, v4.4s, v9.4s\n"
- "movi v23.4s, #0x0\n"
- ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
- "movi v22.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- ".inst 0x4fa1e916 // sdot v22.4s, v8.16b, v1.4b[3]\n"
- "movi v19.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x4f82e115 // sdot v21.4s, v8.16b, v2.4b[0]\n"
- "movi v10.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- ".inst 0x4fa2e113 // sdot v19.4s, v8.16b, v2.4b[1]\n"
- "movi v18.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4f82e909 // sdot v9.4s, v8.16b, v2.4b[2]\n"
- "movi v16.4s, #0x0\n"
- "zip1 v0.4s, v0.4s, v27.4s\n"
- ".inst 0x4fa2e90a // sdot v10.4s, v8.16b, v2.4b[3]\n"
- "zip1 v3.4s, v3.4s, v26.4s\n"
- ".inst 0x4f84e114 // sdot v20.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4fa4e112 // sdot v18.4s, v8.16b, v4.4b[1]\n"
- ".inst 0x4f84e911 // sdot v17.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x4fa4e910 // sdot v16.4s, v8.16b, v4.4b[3]\n"
- "movi v31.4s, #0x0\n"
+ "zip1 v1.4s, v1.4s, v31.4s\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x2\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x4\n"
+ "zip1 v30.4s, v30.4s, v16.4s\n"
+ "mov v16.16b, v4.16b\n"
+ ".inst 0x4f81e258 // sdot v24.4s, v18.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e25c // sdot v28.4s, v18.16b, v1.4b[1]\n"
+ ".inst 0x4f81ea5a // sdot v26.4s, v18.16b, v1.4b[2]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v31.4s\n"
+ "mov v31.16b, v0.16b\n"
+ ".inst 0x4fa1ea5b // sdot v27.4s, v18.16b, v1.4b[3]\n"
+ "zip1 v2.4s, v2.4s, v30.4s\n"
+ "mov v30.16b, v0.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x2\n"
+ "zip1 v29.4s, v29.4s, v16.4s\n"
+ "mov v16.16b, v0.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x4\n"
+ ".inst 0x4f82e255 // sdot v21.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e24c // sdot v12.4s, v18.16b, v2.4b[1]\n"
+ ".inst 0x4f82ea4d // sdot v13.4s, v18.16b, v2.4b[2]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v29.4s\n"
+ "mov v29.16b, v3.16b\n"
+ ".inst 0x4fa2ea4f // sdot v15.4s, v18.16b, v2.4b[3]\n"
+ "zip1 v0.4s, v0.4s, v30.4s\n"
+ "mov v30.16b, v3.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x2\n"
+ "zip1 v31.4s, v31.4s, v16.4s\n"
+ "mov v16.16b, v3.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x4\n"
+ ".inst 0x4f84e257 // sdot v23.4s, v18.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e248 // sdot v8.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4f84ea54 // sdot v20.4s, v18.16b, v4.4b[2]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v0.4s, v0.4s, v31.4s\n"
+ ".inst 0x4fa4ea51 // sdot v17.4s, v18.16b, v4.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v30.4s\n"
"movi v30.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- ".inst 0x4f80e11f // sdot v31.4s, v8.16b, v0.4b[0]\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- ".inst 0x4fa0e11e // sdot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v31.4s, #0x0\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "zip1 v29.4s, v29.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x4f80e256 // sdot v22.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e259 // sdot v25.4s, v18.16b, v0.4b[1]\n"
+ ".inst 0x4f80ea5e // sdot v30.4s, v18.16b, v0.4b[2]\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ ".inst 0x4fa0ea5f // sdot v31.4s, v18.16b, v0.4b[3]\n"
+ "add v27.4s, v27.4s, v15.4s\n"
+ "zip1 v3.4s, v3.4s, v29.4s\n"
"movi v29.4s, #0x0\n"
- ".inst 0x4f80e91a // sdot v26.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x4fa0e91b // sdot v27.4s, v8.16b, v0.4b[3]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4fa3e11d // sdot v29.4s, v8.16b, v3.4b[1]\n"
- "add v24.4s, v24.4s, v21.4s\n"
- "add v25.4s, v25.4s, v19.4s\n"
- "add v23.4s, v23.4s, v9.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "add v21.4s, v20.4s, v21.4s\n"
- "movi v20.4s, #0x0\n"
- ".inst 0x4f83e914 // sdot v20.4s, v8.16b, v3.4b[2]\n"
- "add v19.4s, v18.4s, v19.4s\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x4fa3e912 // sdot v18.4s, v8.16b, v3.4b[3]\n"
- "add v17.4s, v17.4s, v9.4s\n"
- "add v16.4s, v16.4s, v10.4s\n"
- "add v24.4s, v24.4s, v31.4s\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v23.4s, v26.4s\n"
- "add v27.4s, v22.4s, v27.4s\n"
- "add v28.4s, v21.4s, v28.4s\n"
- "add v29.4s, v19.4s, v29.4s\n"
- "add v30.4s, v17.4s, v20.4s\n"
- "add v31.4s, v16.4s, v18.4s\n"
- "neg v12.4s, v12.4s\n"
- "mul v24.4s, v24.4s, v12.4s\n"
- "mul v25.4s, v25.4s, v12.4s\n"
- "mul v26.4s, v26.4s, v12.4s\n"
- "mul v27.4s, v27.4s, v12.4s\n"
- "mul v28.4s, v28.4s, v12.4s\n"
- "mul v29.4s, v29.4s, v12.4s\n"
- "mul v30.4s, v30.4s, v12.4s\n"
- "mul v31.4s, v31.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "movi v21.4s, #0x0\n"
+ "add v12.4s, v8.4s, v12.4s\n"
+ "movi v8.4s, #0x0\n"
+ ".inst 0x4f83e250 // sdot v16.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e25d // sdot v29.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4f83ea55 // sdot v21.4s, v18.16b, v3.4b[2]\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ ".inst 0x4fa3ea48 // sdot v8.4s, v18.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "add v24.4s, v24.4s, v22.4s\n"
+ "add v25.4s, v28.4s, v25.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v31.4s\n"
+ "add v28.4s, v23.4s, v16.4s\n"
+ "add v29.4s, v12.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v21.4s\n"
+ "add v31.4s, v17.4s, v8.4s\n"
+ "mul v24.4s, v24.4s, v19.4s\n"
+ "mul v25.4s, v25.4s, v19.4s\n"
+ "mul v26.4s, v26.4s, v19.4s\n"
+ "mul v27.4s, v27.4s, v19.4s\n"
+ "mul v28.4s, v28.4s, v19.4s\n"
+ "mul v29.4s, v29.4s, v19.4s\n"
+ "mul v30.4s, v30.4s, v19.4s\n"
+ "mul v31.4s, v31.4s, v19.4s\n"
"zip1 v19.4s, v24.4s, v26.4s\n"
"zip1 v18.4s, v25.4s, v27.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
"zip1 v16.4s, v29.4s, v31.4s\n"
"zip1 v22.4s, v19.4s, v18.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
"zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v11.4s\n"
- "add v25.4s, v25.4s, v11.4s\n"
- "add v26.4s, v26.4s, v11.4s\n"
- "add v27.4s, v27.4s, v11.4s\n"
- "add v28.4s, v28.4s, v11.4s\n"
- "add v29.4s, v29.4s, v11.4s\n"
- "add v30.4s, v30.4s, v11.4s\n"
- "add v31.4s, v31.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"ble 2f\n"
"1:" // Loop
"ldr q8, [%x[params], #0x0]\n"
@@ -207,96 +207,96 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
- "cmp %x[n_channels], #0x4\n"
- "add x9, x9, #0x10\n"
- ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
"ldr q5, [%x[params], #0x30]\n"
- ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
"ldr q6, [%x[params], #0x40]\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v21.16b\n"
".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
"ldr q7, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "and v19.16b, v24.16b, v21.16b\n"
"and v18.16b, v25.16b, v21.16b\n"
"and v17.16b, v26.16b, v21.16b\n"
"and v16.16b, v27.16b, v21.16b\n"
- "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
+ "and v18.16b, v29.16b, v21.16b\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
"and v17.16b, v30.16b, v21.16b\n"
"and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v21.4s\n"
- "srshl v25.4s, v25.4s, v21.4s\n"
"srshl v26.4s, v26.4s, v21.4s\n"
"srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v21.4s\n"
"srshl v29.4s, v29.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
"srshl v30.4s, v30.4s, v21.4s\n"
"srshl v31.4s, v31.4s, v21.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v9.4s\n"
+ "smin v25.4s, v25.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v9.4s\n"
+ "smin v27.4s, v27.4s, v9.4s\n"
+ "smin v28.4s, v28.4s, v9.4s\n"
+ "smin v29.4s, v29.4s, v9.4s\n"
+ "smin v30.4s, v30.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v10.4s\n"
+ "smax v25.4s, v25.4s, v10.4s\n"
+ "smax v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v10.4s\n"
+ "smax v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v10.4s\n"
+ "smax v30.4s, v30.4s, v10.4s\n"
+ "smax v31.4s, v31.4s, v10.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -307,33 +307,33 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s24, [x27, x28]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s25, [x26, x28]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s26, [x25, x28]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s27, [x24, x28]\n"
- "str s28, [x23, x28]\n"
+ "str s24, [x27, x28]\n"
+ "str s25, [x26, x28]\n"
"dup v24.4s, v22.s[0]\n"
"dup v25.4s, v22.s[1]\n"
- "str s29, [x22, x28]\n"
+ "str s26, [x25, x28]\n"
"dup v26.4s, v22.s[2]\n"
+ "str s27, [x24, x28]\n"
"dup v27.4s, v22.s[3]\n"
- "str s30, [x21, x28]\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "str s28, [x23, x28]\n"
"dup v28.4s, v23.s[0]\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "str s29, [x22, x28]\n"
"dup v29.4s, v23.s[1]\n"
- "str s31, [x20, x28]\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "str s30, [x21, x28]\n"
"dup v30.4s, v23.s[2]\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "str s31, [x20, x28]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v20.4s\n"
- "add v25.4s, v25.4s, v20.4s\n"
- "add v26.4s, v26.4s, v20.4s\n"
- "add v27.4s, v27.4s, v20.4s\n"
"add v28.4s, v28.4s, v20.4s\n"
"add v29.4s, v29.4s, v20.4s\n"
"add v30.4s, v30.4s, v20.4s\n"
@@ -348,98 +348,98 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
"cmp %x[n_channels], #0x4\n"
"add x27, x27, x28\n"
- ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
"add x26, x26, x28\n"
"add x25, x25, x28\n"
- ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
"add x24, x24, x28\n"
"add x23, x23, x28\n"
- ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
"add x22, x22, x28\n"
"add x21, x21, x28\n"
- ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
"add x20, x20, x28\n"
"add %x[params], %x[params], #0x20\n"
- ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v20.16b\n"
".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
"and v18.16b, v29.16b, v20.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
"and v17.16b, v30.16b, v20.16b\n"
"and v16.16b, v31.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v20.4s\n"
"srshl v29.4s, v29.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v9.4s\n"
+ "smin v25.4s, v25.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v9.4s\n"
+ "smin v27.4s, v27.4s, v9.4s\n"
+ "smin v28.4s, v28.4s, v9.4s\n"
+ "smin v29.4s, v29.4s, v9.4s\n"
+ "smin v30.4s, v30.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v10.4s\n"
+ "smax v25.4s, v25.4s, v10.4s\n"
+ "smax v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v10.4s\n"
+ "smax v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v10.4s\n"
+ "smax v30.4s, v30.4s, v10.4s\n"
+ "smax v31.4s, v31.4s, v10.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -509,7 +509,7 @@ void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"4:" // Tail: End
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
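
The reordered blocks in the hunks above all feed the same Requantize32 output stage: each 4-lane accumulator is scaled with sqrdmulh by the per-layer multiplier (v21), given an and/sshr/sqadd rounding fixup, shifted with srshl by the negative per-layer right shift (v20), offset by c_offset, clamped to minval/maxval, and narrowed with uzp1 before the str stores. Only the interleaving of these independent instructions with the address arithmetic changes; the values computed are identical. As a minimal sketch of that sequence, assuming the register roles read off the assembly above, the C++ below expresses one accumulator's requantisation with NEON intrinsics; requantize_lane and its parameter names are invented for illustration and are not part of the library.

#include <arm_neon.h>

// A minimal sketch (not the library's code) of the fixed-point output stage
// implemented by the sqrdmulh/and/sshr/sqadd/srshl/add/smin/smax blocks above.
static inline int32x4_t requantize_lane(int32x4_t acc,
                                        int32x4_t per_layer_mul,  // multiplier (v21 above)
                                        int32x4_t right_shift,    // negative shift counts (v20 above)
                                        int32x4_t c_offset,       // Requantize32::c_offset
                                        int32x4_t minval,         // Requantize32::minval
                                        int32x4_t maxval)         // Requantize32::maxval
{
  // sqrdmulh: saturating, rounding, doubling high-half multiply.
  int32x4_t t = vqrdmulhq_s32(acc, per_layer_mul);
  // and/sshr #31/sqadd: with a negative shift count the mask picks up the
  // accumulator's sign, so this subtracts one from negative lanes before the
  // rounding shift, matching the reference rounding behaviour.
  int32x4_t fixup = vshrq_n_s32(vandq_s32(t, right_shift), 31);
  t = vqaddq_s32(t, fixup);
  // srshl by a negative amount is a rounding right shift.
  t = vrshlq_s32(t, right_shift);
  // Add the output zero point, then clamp to the activation range.
  t = vaddq_s32(t, c_offset);
  t = vminq_s32(t, maxval);
  return vmaxq_s32(t, minval);
}
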
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 17afc92e30..83962606c8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,158 +41,158 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q12, [%x[params], #0x0]\n"
+ "ldr q22, [%x[params], #0x0]\n"
"ldr q8, [%x[params], #0x10]\n"
- "movi v30.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
+ "movi v23.16b, #0x1\n"
+ "movi v19.4s, #0x0\n"
"ldr q9, [%x[params], #0x20]\n"
"ldr q10, [%x[params], #0x30]\n"
- "movi v16.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
"ldr q11, [%x[params], #0x40]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "movi v24.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "ld1 { v3.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "mov v26.16b, v3.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "ld1 { v4.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "mov v21.16b, v4.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- "ld1 { v2.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "mov v27.16b, v2.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- "ld1 { v1.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x28]\n"
- "zip1 v3.2d, v3.2d, v26.2d\n"
- "zip1 v4.2d, v4.2d, v21.2d\n"
- "ld1 { v5.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x30]\n"
- "mov v26.16b, v1.16b\n"
- "mov v22.16b, v5.16b\n"
- "ld1 { v6.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x38]\n"
- "mov v19.16b, v6.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "ld1 { v7.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x0]\n"
- "mov v21.16b, v7.16b\n"
- "zip1 v2.2d, v2.2d, v27.2d\n"
- "ld1 { v0.16b }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x4f83e3d1 // sdot v17.4s, v30.16b, v3.4b[0]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4f83ebd0 // sdot v16.4s, v30.16b, v3.4b[2]\n"
- ".inst 0x4f84e3d9 // sdot v25.4s, v30.16b, v4.4b[0]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v23.4s }, [x20]\n"
- ".inst 0x4f84ebd8 // sdot v24.4s, v30.16b, v4.4b[2]\n"
- "mov v18.16b, v0.16b\n"
- ".inst 0x4f82e3df // sdot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v28.4s, #0x0\n"
+ "ldr x24, [%x[inptrs], #0x20]\n"
+ "ldr x23, [%x[inptrs], #0x10]\n"
"movi v29.4s, #0x0\n"
- "movi v28.4s, #0x1\n"
- ".inst 0x4f82ebdd // sdot v29.4s, v30.16b, v2.4b[2]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "zip1 v1.2d, v1.2d, v26.2d\n"
- ".inst 0x4fa3e391 // sdot v17.4s, v28.16b, v3.4b[1]\n"
- "zip1 v5.2d, v5.2d, v22.2d\n"
- "zip1 v6.2d, v6.2d, v19.2d\n"
- ".inst 0x4fa3eb90 // sdot v16.4s, v28.16b, v3.4b[3]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v14.4s }, [x20]\n"
- "zip1 v7.2d, v7.2d, v21.2d\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v25.4s, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x8]\n"
+ "ldr x21, [%x[inptrs], #0x28]\n"
"movi v21.4s, #0x0\n"
- ".inst 0x4fa4eb98 // sdot v24.4s, v28.16b, v4.4b[3]\n"
- ".inst 0x4f81e3d6 // sdot v22.4s, v30.16b, v1.4b[0]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "movi v16.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+ "ld1 { v4.16b }, [x24]\n"
+ "ld1 { v2.16b }, [x23]\n"
+ "movi v30.4s, #0x0\n"
"movi v20.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x4f85e3da // sdot v26.4s, v30.16b, v5.4b[0]\n"
- "cmp %x[n_channels], #0x4\n"
- "zip1 v0.2d, v0.2d, v18.2d\n"
+ "ld1 { v1.16b }, [x22]\n"
+ "ld1 { v5.16b }, [x21]\n"
+ "movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
- ".inst 0x4f85ebdb // sdot v27.4s, v30.16b, v5.4b[2]\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "mov v7.16b, v3.16b\n"
+ "ldr x22, [%x[inptrs], #0x38]\n"
+ "movi v24.4s, #0x0\n"
+ "mov v0.16b, v4.16b\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov v14.16b, v2.16b\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "add x11, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x10, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "cmp %x[n_channels], #0x4\n"
"mov x9, #0x0\n"
- ".inst 0x4f86e3d4 // sdot v20.4s, v30.16b, v6.4b[0]\n"
- ".inst 0x4f86ebd3 // sdot v19.4s, v30.16b, v6.4b[2]\n"
- "add v17.4s, v17.4s, v25.4s\n"
"mov x28, #0x0\n"
- "movi v25.4s, #0x0\n"
- ".inst 0x4f87e3d2 // sdot v18.4s, v30.16b, v7.4b[0]\n"
- ".inst 0x4f87ebd9 // sdot v25.4s, v30.16b, v7.4b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- ".inst 0x4fa2e39f // sdot v31.4s, v28.16b, v2.4b[1]\n"
- ".inst 0x4fa2eb9d // sdot v29.4s, v28.16b, v2.4b[3]\n"
- "add v16.4s, v16.4s, v24.4s\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x4f80e3d8 // sdot v24.4s, v30.16b, v0.4b[0]\n"
- ".inst 0x4fa1e396 // sdot v22.4s, v28.16b, v1.4b[1]\n"
+ "zip1 v3.2d, v3.2d, v7.2d\n"
+ "ld1 { v7.16b }, [x22]\n"
+ "neg v12.4s, v12.4s\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- ".inst 0x4fa1eb95 // sdot v21.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x4fa5e39a // sdot v26.4s, v28.16b, v5.4b[1]\n"
- "add v31.4s, v31.4s, v17.4s\n"
+ "zip1 v4.2d, v4.2d, v0.2d\n"
+ "ld1 { v0.16b }, [x21]\n"
+ "zip1 v2.2d, v2.2d, v14.2d\n"
+ "ld1r { v14.4s }, [x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- ".inst 0x4fa5eb9b // sdot v27.4s, v28.16b, v5.4b[3]\n"
- ".inst 0x4fa6e394 // sdot v20.4s, v28.16b, v6.4b[1]\n"
- "add v29.4s, v29.4s, v16.4s\n"
"add %x[params], %x[params], #0x50\n"
- ".inst 0x4fa6eb93 // sdot v19.4s, v28.16b, v6.4b[3]\n"
- ".inst 0x4fa7e392 // sdot v18.4s, v28.16b, v7.4b[1]\n"
- "add v22.4s, v22.4s, v31.4s\n"
- ".inst 0x4fa7eb99 // sdot v25.4s, v28.16b, v7.4b[3]\n"
- ".inst 0x4fa0e398 // sdot v24.4s, v28.16b, v0.4b[1]\n"
- "add v21.4s, v21.4s, v29.4s\n"
- "add v20.4s, v26.4s, v20.4s\n"
- "add v19.4s, v27.4s, v19.4s\n"
- "add v18.4s, v18.4s, v17.4s\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
- ".inst 0x4fa0eb91 // sdot v17.4s, v28.16b, v0.4b[3]\n"
- "add v16.4s, v25.4s, v16.4s\n"
- "add v24.4s, v22.4s, v24.4s\n"
- "add v25.4s, v21.4s, v17.4s\n"
- "add v26.4s, v26.4s, v22.4s\n"
- "add v27.4s, v27.4s, v21.4s\n"
- "add v28.4s, v20.4s, v31.4s\n"
- "add v29.4s, v19.4s, v29.4s\n"
- "add v30.4s, v20.4s, v18.4s\n"
- "add v31.4s, v19.4s, v16.4s\n"
- "neg v23.4s, v23.4s\n"
- "mul v24.4s, v24.4s, v23.4s\n"
- "mul v25.4s, v25.4s, v23.4s\n"
- "mul v26.4s, v26.4s, v23.4s\n"
- "mul v27.4s, v27.4s, v23.4s\n"
- "mul v28.4s, v28.4s, v23.4s\n"
- "mul v29.4s, v29.4s, v23.4s\n"
- "mul v30.4s, v30.4s, v23.4s\n"
- "mul v31.4s, v31.4s, v23.4s\n"
- "zip1 v19.4s, v24.4s, v26.4s\n"
- "zip1 v18.4s, v25.4s, v27.4s\n"
+ ".inst 0x4f83e2f3 // sdot v19.4s, v23.16b, v3.4b[0]\n"
+ ".inst 0x4f83eaed // sdot v13.4s, v23.16b, v3.4b[2]\n"
+ ".inst 0x4f84e2ef // sdot v15.4s, v23.16b, v4.4b[0]\n"
+ ".inst 0x4f84eaff // sdot v31.4s, v23.16b, v4.4b[2]\n"
+ ".inst 0x4f82e2fc // sdot v28.4s, v23.16b, v2.4b[0]\n"
+ ".inst 0x4f82eafd // sdot v29.4s, v23.16b, v2.4b[2]\n"
+ ".inst 0x4fa3e333 // sdot v19.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa3eb2d // sdot v13.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e32f // sdot v15.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa4eb3f // sdot v31.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa2e33c // sdot v28.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa2eb3d // sdot v29.4s, v25.16b, v2.4b[3]\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "ld1r { v15.4s }, [x11]\n"
+ "add v31.4s, v13.4s, v31.4s\n"
+ "mov v13.16b, v1.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "add v28.4s, v28.4s, v19.4s\n"
+ "add v29.4s, v29.4s, v31.4s\n"
+ "zip1 v1.2d, v1.2d, v13.2d\n"
+ "mov v13.16b, v5.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4f81e2f5 // sdot v21.4s, v23.16b, v1.4b[0]\n"
+ ".inst 0x4f81eaf0 // sdot v16.4s, v23.16b, v1.4b[2]\n"
+ "zip1 v5.2d, v5.2d, v13.2d\n"
+ "mov v13.16b, v6.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4f85e2fa // sdot v26.4s, v23.16b, v5.4b[0]\n"
+ ".inst 0x4f85eafb // sdot v27.4s, v23.16b, v5.4b[2]\n"
+ ".inst 0x4fa1e335 // sdot v21.4s, v25.16b, v1.4b[1]\n"
+ "zip1 v6.2d, v6.2d, v13.2d\n"
+ "mov v13.16b, v7.16b\n"
+ ".inst 0x4fa1eb30 // sdot v16.4s, v25.16b, v1.4b[3]\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4f86e2fe // sdot v30.4s, v23.16b, v6.4b[0]\n"
+ ".inst 0x4f86eaf4 // sdot v20.4s, v23.16b, v6.4b[2]\n"
+ ".inst 0x4fa5e33a // sdot v26.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "zip1 v7.2d, v7.2d, v13.2d\n"
+ "ld1r { v13.4s }, [x10]\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ ".inst 0x4fa6e33e // sdot v30.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4fa6eb34 // sdot v20.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4f87e2f1 // sdot v17.4s, v23.16b, v7.4b[0]\n"
+ ".inst 0x4f87eaf2 // sdot v18.4s, v23.16b, v7.4b[2]\n"
+ "add v30.4s, v26.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4fa7e331 // sdot v17.4s, v25.16b, v7.4b[1]\n"
+ "add v20.4s, v27.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v16.4s\n"
+ "add v28.4s, v30.4s, v28.4s\n"
+ ".inst 0x4fa7eb32 // sdot v18.4s, v25.16b, v7.4b[3]\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "add v19.4s, v17.4s, v19.4s\n"
+ "mov v17.16b, v0.16b\n"
+ "add v29.4s, v20.4s, v29.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "add v31.4s, v18.4s, v31.4s\n"
+ "movi v18.4s, #0x0\n"
+ "add v30.4s, v30.4s, v19.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "zip1 v0.2d, v0.2d, v17.2d\n"
+ "add v31.4s, v20.4s, v31.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ ".inst 0x4f80e2f8 // sdot v24.4s, v23.16b, v0.4b[0]\n"
+ ".inst 0x4f80eaf2 // sdot v18.4s, v23.16b, v0.4b[2]\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
- "zip1 v16.4s, v29.4s, v31.4s\n"
- "zip1 v22.4s, v19.4s, v18.4s\n"
- "zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v22.4s\n"
+ "add v30.4s, v30.4s, v22.4s\n"
+ ".inst 0x4fa0e338 // sdot v24.4s, v25.16b, v0.4b[1]\n"
+ "zip1 v19.4s, v29.4s, v31.4s\n"
+ "add v29.4s, v29.4s, v22.4s\n"
+ ".inst 0x4fa0eb32 // sdot v18.4s, v25.16b, v0.4b[3]\n"
+ "add v31.4s, v31.4s, v22.4s\n"
+ "add v24.4s, v21.4s, v24.4s\n"
+ "zip1 v23.4s, v17.4s, v19.4s\n"
+ "add v25.4s, v16.4s, v18.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "zip1 v17.4s, v24.4s, v26.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "zip1 v16.4s, v25.4s, v27.4s\n"
+ "add v27.4s, v27.4s, v22.4s\n"
+ "add v24.4s, v24.4s, v22.4s\n"
+ "add v25.4s, v25.4s, v22.4s\n"
+ "zip1 v22.4s, v17.4s, v16.4s\n"
"ble 2f\n"
"1:" // Loop
"ldr q12, [%x[params], #0x60]\n"
@@ -203,159 +203,159 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
- "cmp %x[n_channels], #0x4\n"
- "add x9, x9, #0x10\n"
- ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
"ldr q17, [%x[params], #0x0]\n"
- ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q16, [%x[params], #0x10]\n"
- ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
- ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
- ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q19, [%x[params], #0x20]\n"
- ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
- ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
- ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
- ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q18, [%x[params], #0x30]\n"
- ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
- ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
- ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
- ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
"ldr q17, [%x[params], #0x40]\n"
- ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
- ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
- ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
- ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
"ldr q16, [%x[params], #0x50]\n"
- ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
- ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
- ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
- ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
- ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
- ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
- ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
"ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
- ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v12.4s\n"
- ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
- ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v12.4s\n"
".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
"ldr q8, [%x[params], #0x90]\n"
- "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v21.16b\n"
".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "and v19.16b, v24.16b, v21.16b\n"
"and v18.16b, v25.16b, v21.16b\n"
"and v17.16b, v26.16b, v21.16b\n"
"and v16.16b, v27.16b, v21.16b\n"
- "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v12.4s\n"
- "sqrdmulh v29.4s, v29.4s, v12.4s\n"
- "sqrdmulh v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v31.4s, v31.4s, v12.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
+ "and v18.16b, v29.16b, v21.16b\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
"and v17.16b, v30.16b, v21.16b\n"
"and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v21.4s\n"
- "srshl v25.4s, v25.4s, v21.4s\n"
"srshl v26.4s, v26.4s, v21.4s\n"
"srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v21.4s\n"
"srshl v29.4s, v29.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v21.4s\n"
"srshl v31.4s, v31.4s, v21.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v28.4s, v28.4s, v13.4s\n"
- "add v29.4s, v29.4s, v13.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "add v31.4s, v31.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -366,33 +366,33 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s24, [x27, x28]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s25, [x26, x28]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s26, [x25, x28]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s27, [x24, x28]\n"
- "str s28, [x23, x28]\n"
+ "str s24, [x27, x28]\n"
+ "str s25, [x26, x28]\n"
"dup v24.4s, v22.s[0]\n"
"dup v25.4s, v22.s[1]\n"
- "str s29, [x22, x28]\n"
+ "str s26, [x25, x28]\n"
"dup v26.4s, v22.s[2]\n"
+ "str s27, [x24, x28]\n"
"dup v27.4s, v22.s[3]\n"
- "str s30, [x21, x28]\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "str s28, [x23, x28]\n"
"dup v28.4s, v23.s[0]\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "str s29, [x22, x28]\n"
"dup v29.4s, v23.s[1]\n"
- "str s31, [x20, x28]\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "str s30, [x21, x28]\n"
"dup v30.4s, v23.s[2]\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "str s31, [x20, x28]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v20.4s\n"
- "add v25.4s, v25.4s, v20.4s\n"
- "add v26.4s, v26.4s, v20.4s\n"
- "add v27.4s, v27.4s, v20.4s\n"
"add v28.4s, v28.4s, v20.4s\n"
"add v29.4s, v29.4s, v20.4s\n"
"add v30.4s, v30.4s, v20.4s\n"
@@ -407,160 +407,160 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
"cmp %x[n_channels], #0x4\n"
"add x27, x27, x28\n"
- ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
"add x26, x26, x28\n"
"add x25, x25, x28\n"
- ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
"add x24, x24, x28\n"
+ ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
"add x23, x23, x28\n"
- ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
"add x22, x22, x28\n"
+ ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
"add x21, x21, x28\n"
- ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q17, [%x[params], #0x0]\n"
"add x20, x20, x28\n"
- ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q16, [%x[params], #0x10]\n"
- ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
- ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
- ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q19, [%x[params], #0x20]\n"
- ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
- ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
- ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
- ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q18, [%x[params], #0x30]\n"
- ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
- ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
- ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
- ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
"ldr q17, [%x[params], #0x40]\n"
- ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
- ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
- ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
- ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
"ldr q16, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
- ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
- ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
- ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
- ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
- ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
- ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
- ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
- ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
- ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
- ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v20.16b\n"
".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
"and v18.16b, v29.16b, v20.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
"and v17.16b, v30.16b, v20.16b\n"
"and v16.16b, v31.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v20.4s\n"
"srshl v29.4s, v29.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v28.4s, v28.4s, v13.4s\n"
- "add v29.4s, v29.4s, v13.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "add v31.4s, v31.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -630,7 +630,7 @@ void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"4:" // Tail: End
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
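
The remaining hunks move to the generic a64_s8q_packed_to_nhwc MLA kernel, which accumulates with widening multiplies instead of dot products: weights and inputs are widened from int8 with ssubl, subtracting the splatted b_offset and a_offset zero points (v12 and v13), and each accumulator is updated with smlal against a single broadcast input element. As with the dot-product kernels, the reordering only interleaves these independent instructions with the pointer loads and stores. The C++ below is a minimal sketch, assuming the register roles read off the assembly, of one such accumulate step using NEON intrinsics; accumulate_step and its parameter names are invented for illustration and are not part of the library.

#include <arm_neon.h>

// A minimal sketch (not the library's code) of the ssubl/smlal accumulation
// step used by the generic MLA kernel below.
static inline int32x4_t accumulate_step(int32x4_t acc,       // one accumulator, e.g. v16
                                        int8x8_t weights,    // four int8 weights (v5)
                                        int8x8_t inputs,     // eight int8 input values (v0)
                                        int8_t weight_zero,  // Requantize32::b_offset (v12)
                                        int8_t input_zero)   // Requantize32::a_offset (v13)
{
  // ssubl: widen to 16 bits while subtracting the quantisation zero points.
  int16x8_t w = vsubl_s8(weights, vdup_n_s8(weight_zero));
  int16x8_t x = vsubl_s8(inputs,  vdup_n_s8(input_zero));
  // smlal v16.4s, v5.4h, v0.h[0]: widening multiply-accumulate of the low
  // weight half against one broadcast input element; the kernel unrolls this
  // over lanes 0..7 into accumulators v16..v23 (and v24..v31 for the second
  // input register).
  return vmlal_laneq_s16(acc, vget_low_s16(w), x, 0);
}
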
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index b21ad484e5..be3c8cf9f8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,21 +49,21 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v15.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.4s }, [x21]\n"
"ld1r { v13.16b }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.16b }, [x21]\n"
"ld1r { v11.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v10.4s }, [x21]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
+ "ld1r { v8.4s }, [x20]\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
"movi v31.4s, #0x0\n"
@@ -96,20 +96,20 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"3:" // Output channel loop: Load quantization parameters: Done
"ldr s5, [%x[weights]], #0x4\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
+ "ldp x21, x20, [x22], #0x10\n"
"ldr d0, [x21, #0x0]\n"
"ldr d4, [x20, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"cbz x23, 7f\n"
"ldr s7, [%x[weights]], #0x4\n"
"ldp x21, x20, [x22], #0x10\n"
"subs x23, x23, #0x1\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
"ldr d3, [x21, #0x0]\n"
"ldr d6, [x20, #0x0]\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
@@ -125,13 +125,13 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x21, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d4, [x20, #0x0]\n"
@@ -139,22 +139,22 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"ldp x21, x20, [x22], #0x10\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
"ldr d3, [x21, #0x0]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"ldr d6, [x20, #0x0]\n"
@@ -172,54 +172,54 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v19.4s, v5.4h, v0.h[3]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v7.4h, v3.h[0]\n"
- "smlal v17.4s, v7.4h, v3.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v7.4h, v3.h[2]\n"
- "smlal v19.4s, v7.4h, v3.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
- "sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
- "and v3.16b, v16.16b, v8.16b\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
- "and v2.16b, v17.16b, v8.16b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
- "and v1.16b, v18.16b, v8.16b\n"
- "and v0.16b, v19.16b, v8.16b\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
"sshl v20.4s, v20.4s, v10.4s\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
- "smlal v28.4s, v7.4h, v6.h[4]\n"
- "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -357,49 +357,49 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -421,70 +421,70 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x20, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
"ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr s5, [%x[weights]], #0x4\n"
"ldr d4, [x28, #0x0]\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"smlal v16.4s, v5.4h, v0.h[0]\n"
"smlal v17.4s, v5.4h, v0.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
"smlal v18.4s, v5.4h, v0.h[2]\n"
"smlal v19.4s, v5.4h, v0.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "smlal v20.4s, v7.4h, v3.h[4]\n"
- "smlal v21.4s, v7.4h, v3.h[5]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "smlal v22.4s, v7.4h, v3.h[6]\n"
- "smlal v23.4s, v7.4h, v3.h[7]\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
"smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
- "and v3.16b, v16.16b, v8.16b\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
- "and v2.16b, v17.16b, v8.16b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
- "and v1.16b, v18.16b, v8.16b\n"
- "and v0.16b, v19.16b, v8.16b\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
"sshl v20.4s, v20.4s, v10.4s\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
- "smlal v28.4s, v5.4h, v4.h[4]\n"
- "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -622,49 +622,49 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -673,45 +673,45 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"7:" // Output channel loop: Single kernel point
"smlal v16.4s, v5.4h, v0.h[0]\n"
"smlal v17.4s, v5.4h, v0.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
"ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"smlal v18.4s, v5.4h, v0.h[2]\n"
"smlal v19.4s, v5.4h, v0.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
- "smlal v20.4s, v5.4h, v0.h[4]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x18]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
"and v3.16b, v16.16b, v8.16b\n"
- "smlal v27.4s, v5.4h, v4.h[3]\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
"and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
"and v1.16b, v18.16b, v8.16b\n"
- "smlal v28.4s, v5.4h, v4.h[4]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"and v0.16b, v19.16b, v8.16b\n"
- "sshl v20.4s, v20.4s, v10.4s\n"
- "smlal v29.4s, v5.4h, v4.h[5]\n"
- "sshl v21.4s, v21.4s, v10.4s\n"
- "sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v30.4s, v5.4h, v4.h[6]\n"
- "sshl v23.4s, v23.4s, v10.4s\n"
- "sshl v24.4s, v24.4s, v10.4s\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
@@ -848,49 +848,49 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -965,20 +965,20 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"18:" // Output channel oddments: Load quantization parameters: Done
"ldr s5, [%x[weights]], #0x4\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
+ "ldp x21, x20, [x22], #0x10\n"
"ldr d0, [x21, #0x0]\n"
"ldr d4, [x20, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"cbz x23, 22f\n"
"ldr s7, [%x[weights]], #0x4\n"
"ldp x21, x20, [x22], #0x10\n"
"subs x23, x23, #0x1\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
"ldr d3, [x21, #0x0]\n"
"ldr d6, [x20, #0x0]\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
@@ -994,13 +994,13 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x21, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d4, [x20, #0x0]\n"
@@ -1008,22 +1008,22 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"ldp x21, x20, [x22], #0x10\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
"ldr d3, [x21, #0x0]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"ldr d6, [x20, #0x0]\n"
@@ -1077,27 +1077,27 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d2, [x21, #0x0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d1, [x20, #0x0]\n"
"ldr s0, [%x[weights]], #0x4\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
@@ -1145,18 +1145,18 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"sshl v17.4s, v17.4s, v10.4s\n"
"sshl v18.4s, v18.4s, v10.4s\n"
"sshl v19.4s, v19.4s, v10.4s\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
"and v3.16b, v16.16b, v8.16b\n"
"and v2.16b, v17.16b, v8.16b\n"
"and v1.16b, v18.16b, v8.16b\n"
"and v0.16b, v19.16b, v8.16b\n"
- "sshl v20.4s, v20.4s, v10.4s\n"
- "sshl v21.4s, v21.4s, v10.4s\n"
- "sshl v22.4s, v22.4s, v10.4s\n"
- "sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -1320,47 +1320,47 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"tbz %x[n_output_channels], #1, 24f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.h }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.h }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.h }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.h }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.h }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.h }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.h }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.h }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
- "add x9, x9, #0x2\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.h }[0], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.h }[0], [x26]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
"st1 { v26.h }[0], [x25]\n"
"st1 { v27.h }[0], [x24]\n"
"st1 { v28.h }[0], [x23]\n"
@@ -1370,46 +1370,46 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"tbz %x[n_output_channels], #0, 25f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.b }[2], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.b }[2], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.b }[2], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.b }[2], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.b }[2], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.b }[2], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.b }[2], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.b }[2], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.b }[2], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.b }[2], [x26]\n"
+ "add x20, x20, x9\n"
"st1 { v26.b }[2], [x25]\n"
"st1 { v27.b }[2], [x24]\n"
"st1 { v28.b }[2], [x23]\n"
@@ -1420,46 +1420,46 @@ void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.b }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.b }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.b }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.b }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.b }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.b }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.b }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.b }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.b }[0], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.b }[0], [x26]\n"
+ "add x20, x20, x9\n"
"st1 { v26.b }[0], [x25]\n"
"st1 { v27.b }[0], [x24]\n"
"st1 { v28.b }[0], [x23]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index aad34c4c25..80a2deae4a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,1441 +35,1441 @@ void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_cha
__asm__ __volatile__(
"lsr x15, %x[n_channels], #0x4\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v8.4s }, [x20]\n"
- "ldp x14, x13, [%x[inptrs], #0x0]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v12.4s }, [x20]\n"
+ "ldp x14, x27, [%x[inptrs], #0x0]\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "ldp x24, x23, [%x[inptrs], #0x20]\n"
+ "ld1r { v11.4s }, [x21]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "mov x13, #0x0\n"
"mov x12, #0x0\n"
- "mov x11, #0x0\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "ldp x25, x24, [%x[outptrs], #0x0]\n"
- "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "ldp x22, x21, [%x[inptrs], #0x30]\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
"cbz x15, 3f\n"
- "ldr q11, [x14, x12]\n"
- "ldr q20, [x13, x12]\n"
+ "ldr q12, [x14, x13]\n"
+ "ldr q24, [x27, x13]\n"
"subs x15, x15, #0x1\n"
- "ldr q16, [x10, x12]\n"
- "ldr q14, [x9, x12]\n"
- "zip2 v19.16b, v11.16b, v16.16b\n"
- "zip1 v11.16b, v11.16b, v16.16b\n"
- "ldr q13, [x28, x12]\n"
- "ldr q18, [x27, x12]\n"
- "zip1 v17.16b, v20.16b, v14.16b\n"
- "zip2 v14.16b, v20.16b, v14.16b\n"
- "ldr q16, [x26, x12]\n"
- "ldr q27, [x21, x12]\n"
- "zip2 v10.16b, v11.16b, v17.16b\n"
- "zip1 v11.16b, v11.16b, v17.16b\n"
- "ldr q24, [%x[params], #0x10]\n"
- "ldr q9, [%x[params], #0x20]\n"
- "zip1 v3.16b, v19.16b, v14.16b\n"
- "zip2 v14.16b, v19.16b, v14.16b\n"
- "ldr q31, [%x[params], #0x0]\n"
- "ldr q6, [%x[params], #0x30]\n"
- "zip2 v30.16b, v13.16b, v16.16b\n"
- "zip1 v13.16b, v13.16b, v16.16b\n"
- "ldp x21, x20, [%x[inptrs], #0x40]\n"
- "ldr q5, [x21, x12]\n"
- "zip1 v16.16b, v18.16b, v27.16b\n"
- "zip2 v27.16b, v18.16b, v27.16b\n"
- "ldr q17, [x20, x12]\n"
- "ldp x21, x20, [%x[inptrs], #0x50]\n"
- "zip2 v28.16b, v13.16b, v16.16b\n"
- "zip1 v13.16b, v13.16b, v16.16b\n"
- "ldr q16, [x21, x12]\n"
- "ldr q7, [x20, x12]\n"
- "zip2 v20.16b, v5.16b, v16.16b\n"
- "zip1 v5.16b, v5.16b, v16.16b\n"
- "ldp x21, x20, [%x[inptrs], #0x60]\n"
- "ldr q16, [x21, x12]\n"
- "zip1 v22.16b, v17.16b, v7.16b\n"
- "zip2 v7.16b, v17.16b, v7.16b\n"
- "ldr q19, [x20, x12]\n"
+ "ldr q10, [x26, x13]\n"
+ "ldr q14, [x25, x13]\n"
+ "ldr q15, [x24, x13]\n"
+ "ldr q20, [x23, x13]\n"
+ "ldr q16, [x22, x13]\n"
+ "ldr q28, [x21, x13]\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v19.16b, v12.16b, v10.16b\n"
+ "zip1 v12.16b, v12.16b, v10.16b\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "zip1 v17.16b, v24.16b, v14.16b\n"
+ "zip2 v14.16b, v24.16b, v14.16b\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "zip2 v18.16b, v15.16b, v16.16b\n"
+ "zip1 v15.16b, v15.16b, v16.16b\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "zip1 v21.16b, v30.16b, v27.16b\n"
- "zip2 v27.16b, v30.16b, v27.16b\n"
- "ldr q30, [x21, x12]\n"
- "ldr q1, [x20, x12]\n"
- "zip2 v17.16b, v16.16b, v30.16b\n"
- "zip1 v16.16b, v16.16b, v30.16b\n"
- "zip1 v18.16b, v19.16b, v1.16b\n"
- "zip2 v1.16b, v19.16b, v1.16b\n"
- "ldp x14, x13, [%x[inptrs], #0x0]\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "zip2 v29.16b, v5.16b, v22.16b\n"
- "zip1 v5.16b, v5.16b, v22.16b\n"
- "zip1 v0.16b, v20.16b, v7.16b\n"
- "zip2 v7.16b, v20.16b, v7.16b\n"
+ "zip1 v16.16b, v20.16b, v28.16b\n"
+ "zip2 v28.16b, v20.16b, v28.16b\n"
+ "ldr q8, [x27, x13]\n"
+ "ldr q21, [x26, x13]\n"
+ "zip2 v23.16b, v12.16b, v17.16b\n"
+ "zip1 v12.16b, v12.16b, v17.16b\n"
+ "ldp x14, x27, [%x[inptrs], #0x0]\n"
+ "ldr q17, [x25, x13]\n"
+ "ldr q22, [x24, x13]\n"
+ "zip1 v30.16b, v19.16b, v14.16b\n"
+ "zip2 v14.16b, v19.16b, v14.16b\n"
+ "ldr q9, [x23, x13]\n"
+ "ldr q20, [x22, x13]\n"
+ "zip2 v5.16b, v15.16b, v16.16b\n"
+ "zip1 v15.16b, v15.16b, v16.16b\n"
+ "ldr q16, [x21, x13]\n"
+ "ldr q2, [x20, x13]\n"
+ "zip1 v7.16b, v18.16b, v28.16b\n"
+ "zip2 v28.16b, v18.16b, v28.16b\n"
+ "ldr q3, [%x[params], #0x0]\n"
+ "zip2 v19.16b, v8.16b, v17.16b\n"
+ "zip1 v8.16b, v8.16b, v17.16b\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
+ "zip1 v18.16b, v21.16b, v22.16b\n"
+ "zip2 v22.16b, v21.16b, v22.16b\n"
+ "ldp x24, x23, [%x[inptrs], #0x20]\n"
+ "ldp x22, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v17.16b, v9.16b, v16.16b\n"
+ "zip1 v9.16b, v9.16b, v16.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v30.16b, v16.16b, v18.16b\n"
- "zip1 v16.16b, v16.16b, v18.16b\n"
- "zip1 v2.16b, v17.16b, v1.16b\n"
- "zip2 v1.16b, v17.16b, v1.16b\n"
- "mov v26.16b, v31.16b\n"
- "mov v18.16b, v31.16b\n"
- "mov v4.16b, v31.16b\n"
+ "zip1 v16.16b, v20.16b, v2.16b\n"
+ "zip2 v2.16b, v20.16b, v2.16b\n"
+ "zip2 v21.16b, v8.16b, v18.16b\n"
+ "zip1 v8.16b, v8.16b, v18.16b\n"
+ "zip1 v29.16b, v19.16b, v22.16b\n"
+ "zip2 v22.16b, v19.16b, v22.16b\n"
+ "zip2 v1.16b, v9.16b, v16.16b\n"
+ "zip1 v9.16b, v9.16b, v16.16b\n"
+ "zip1 v31.16b, v17.16b, v2.16b\n"
+ "zip2 v2.16b, v17.16b, v2.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "mov v18.16b, v3.16b\n"
"beq 2f\n"
"1:" // Loop
- ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
- ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
- "ext v11.16b, v11.16b, v11.16b, #0x1\n"
- "add x12, x12, #0x10\n"
- ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
- ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
- "ldr q17, [%x[params], #0x0]\n"
- ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
- ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e8c9483 // sdot v3.4s, v4.16b, v12.16b\n"
+ ".inst 0x4e8f9480 // sdot v0.4s, v4.16b, v15.16b\n"
+ "ext v12.16b, v12.16b, v12.16b, #0x1\n"
+ "add x13, x13, #0x10\n"
"subs x15, x15, #0x1\n"
- ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ ".inst 0x4e8c949a // sdot v26.4s, v4.16b, v12.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8f94c3 // sdot v3.4s, v6.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e8894c0 // sdot v0.4s, v6.16b, v8.16b\n"
+ ".inst 0x4e8f9492 // sdot v18.4s, v4.16b, v15.16b\n"
+ ".inst 0x4e889543 // sdot v3.4s, v10.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8f94da // sdot v26.4s, v6.16b, v15.16b\n"
"ldr q20, [%x[params], #0x10]\n"
- ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
- ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
- ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ ".inst 0x4e8894d2 // sdot v18.4s, v6.16b, v8.16b\n"
+ ".inst 0x4e899540 // sdot v0.4s, v10.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ ".inst 0x4e88955a // sdot v26.4s, v10.16b, v8.16b\n"
+ ".inst 0x4e899552 // sdot v18.4s, v10.16b, v9.16b\n"
+ "and v16.16b, v3.16b, v20.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "ldr q5, [%x[params], #0x60]\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v19.16b, v26.16b, v20.16b\n"
- "and v17.16b, v18.16b, v20.16b\n"
- "and v16.16b, v4.16b, v20.16b\n"
+ "ldr q6, [%x[params], #0x60]\n"
+ "and v19.16b, v0.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v18.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "sqadd v26.4s, v26.4s, v19.4s\n"
- "ldr q13, [%x[params], #0x40]\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "ldr q17, [%x[params], #0x50]\n"
- "sqadd v4.4s, v4.4s, v16.4s\n"
+ "sqadd v0.4s, v0.4s, v19.4s\n"
+ "ldr q8, [%x[params], #0x50]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
"ldr q16, [%x[params], #0x30]\n"
- "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v0.4s, v0.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
"srshl v18.4s, v18.4s, v20.4s\n"
- "srshl v4.4s, v4.4s, v20.4s\n"
- "ldr q22, [%x[params], #0x70]\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v18.4s, v18.4s, v15.4s\n"
- "add v4.4s, v4.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v18.4s, v18.4s, v8.4s\n"
- "smax v4.4s, v4.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smin v4.4s, v4.4s, v12.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "ldr q12, [%x[params], #0x70]\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v13.4s\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smax v18.4s, v18.4s, v27.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s31, [x25, x11]\n"
+ "str s3, [x11, x12]\n"
"ldr q24, [%x[params], #0x20]\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s26, [x24, x11]\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s18, [x23, x11]\n"
- "mov v26.16b, v24.16b\n"
- "str s4, [x22, x11]\n"
- "mov v25.16b, v24.16b\n"
- "mov v23.16b, v24.16b\n"
- ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n"
- ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "add x11, x11, #0x4\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n"
- "ldr q10, [x13, x12]\n"
- ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n"
- ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n"
- "ldr q20, [x27, x12]\n"
- ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n"
- "sqrdmulh v24.4s, v24.4s, v5.4s\n"
- ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n"
- ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n"
- "and v16.16b, v24.16b, v22.16b\n"
+ "str s26, [x10, x12]\n"
+ "mov v15.16b, v24.16b\n"
+ "str s0, [x9, x12]\n"
+ "mov v20.16b, v24.16b\n"
+ "str s18, [x28, x12]\n"
+ "mov v4.16b, v24.16b\n"
+ ".inst 0x4e979618 // sdot v24.4s, v16.16b, v23.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e859614 // sdot v20.4s, v16.16b, v5.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e859638 // sdot v24.4s, v17.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e97960f // sdot v15.4s, v16.16b, v23.16b\n"
+ "ldr q3, [x27, x13]\n"
+ ".inst 0x4e859604 // sdot v4.4s, v16.16b, v5.16b\n"
+ ".inst 0x4e959634 // sdot v20.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e959518 // sdot v24.4s, v8.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e85962f // sdot v15.4s, v17.16b, v5.16b\n"
+ "ldr q19, [x23, x13]\n"
+ ".inst 0x4e959624 // sdot v4.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e819514 // sdot v20.4s, v8.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v6.4s\n"
+ ".inst 0x4e95950f // sdot v15.4s, v8.16b, v21.16b\n"
+ ".inst 0x4e819504 // sdot v4.4s, v8.16b, v1.16b\n"
+ "and v16.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v6.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v5.4s\n"
- "sqrdmulh v25.4s, v25.4s, v5.4s\n"
- "sqrdmulh v23.4s, v23.4s, v5.4s\n"
- "ldr q19, [%x[params], #0xc0]\n"
+ "sqrdmulh v15.4s, v15.4s, v6.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v6.4s\n"
+ "ldr q25, [%x[params], #0xc0]\n"
+ "and v18.16b, v20.16b, v12.16b\n"
"sqadd v24.4s, v24.4s, v16.4s\n"
- "and v18.16b, v26.16b, v22.16b\n"
- "and v17.16b, v25.16b, v22.16b\n"
- "and v16.16b, v23.16b, v22.16b\n"
+ "and v17.16b, v15.16b, v12.16b\n"
+ "and v16.16b, v4.16b, v12.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v24.4s, v24.4s, v22.4s\n"
- "sqadd v26.4s, v26.4s, v18.4s\n"
- "ldr q18, [%x[params], #0xa0]\n"
- "sqadd v25.4s, v25.4s, v17.4s\n"
- "ldr q17, [%x[params], #0xb0]\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xb0]\n"
+ "sqadd v15.4s, v15.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xa0]\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
"ldr q16, [%x[params], #0x90]\n"
- "add v24.4s, v24.4s, v15.4s\n"
- "srshl v26.4s, v26.4s, v22.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "srshl v23.4s, v23.4s, v22.4s\n"
- "ldr q22, [%x[params], #0xd0]\n"
- "smax v24.4s, v24.4s, v8.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "smin v24.4s, v24.4s, v12.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v25.4s, v25.4s, v8.4s\n"
- "smax v23.4s, v23.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smin v23.4s, v23.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v12.4s\n"
+ "smax v24.4s, v24.4s, v27.4s\n"
+ "srshl v4.4s, v4.4s, v12.4s\n"
+ "ldr q8, [%x[params], #0xd0]\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "add v15.4s, v15.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "add v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v27.4s\n"
+ "smax v15.4s, v15.4s, v27.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
+ "smax v4.4s, v4.4s, v27.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v15.4s, v15.4s, v11.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x25, x11]\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str s24, [x11, x12]\n"
"ldr q24, [%x[params], #0x80]\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s26, [x24, x11]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s25, [x23, x11]\n"
- "str s23, [x22, x11]\n"
- "mov v23.16b, v24.16b\n"
- "mov v31.16b, v24.16b\n"
- ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
- "mov v13.16b, v24.16b\n"
- ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n"
- ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n"
- "add x11, x11, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n"
- "ldr q3, [x10, x12]\n"
- ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n"
- ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n"
- ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n"
- "ldr q4, [x26, x12]\n"
- ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n"
- ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "sqrdmulh v24.4s, v24.4s, v19.4s\n"
- ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n"
- ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n"
- "and v16.16b, v24.16b, v22.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s15, [x10, x12]\n"
+ "mov v21.16b, v24.16b\n"
+ "str s20, [x9, x12]\n"
+ "mov v20.16b, v24.16b\n"
+ "str s4, [x28, x12]\n"
+ "mov v12.16b, v24.16b\n"
+ ".inst 0x4e9e9618 // sdot v24.4s, v16.16b, v30.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e879614 // sdot v20.4s, v16.16b, v7.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e879638 // sdot v24.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9e9615 // sdot v21.4s, v16.16b, v30.16b\n"
+ "ldr q30, [x26, x13]\n"
+ ".inst 0x4e87960c // sdot v12.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e9d9634 // sdot v20.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9d9658 // sdot v24.4s, v18.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ "ldr q1, [x22, x13]\n"
+ ".inst 0x4e9d962c // sdot v12.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9f9654 // sdot v20.4s, v18.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v25.4s\n"
+ ".inst 0x4e9d9655 // sdot v21.4s, v18.16b, v29.16b\n"
+ ".inst 0x4e9f964c // sdot v12.4s, v18.16b, v31.16b\n"
+ "and v16.16b, v24.16b, v8.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v25.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v23.4s, v23.4s, v19.4s\n"
- "sqrdmulh v31.4s, v31.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "ldr q19, [%x[params], #0x120]\n"
+ "sqrdmulh v21.4s, v21.4s, v25.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v25.4s\n"
+ "ldr q29, [%x[params], #0x120]\n"
+ "and v18.16b, v20.16b, v8.16b\n"
"sqadd v24.4s, v24.4s, v16.4s\n"
- "and v18.16b, v23.16b, v22.16b\n"
- "and v17.16b, v31.16b, v22.16b\n"
- "and v16.16b, v13.16b, v22.16b\n"
+ "and v17.16b, v21.16b, v8.16b\n"
+ "and v16.16b, v12.16b, v8.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v24.4s, v24.4s, v22.4s\n"
- "sqadd v23.4s, v23.4s, v18.4s\n"
- "ldr q18, [%x[params], #0x100]\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "ldr q17, [%x[params], #0x110]\n"
- "sqadd v13.4s, v13.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x110]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x100]\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "sqadd v12.4s, v12.4s, v16.4s\n"
"ldr q16, [%x[params], #0xf0]\n"
- "add v24.4s, v24.4s, v15.4s\n"
- "srshl v23.4s, v23.4s, v22.4s\n"
- "srshl v31.4s, v31.4s, v22.4s\n"
- "srshl v13.4s, v13.4s, v22.4s\n"
- "ldr q22, [%x[params], #0x130]\n"
- "smax v24.4s, v24.4s, v8.4s\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "add v31.4s, v31.4s, v15.4s\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "smin v24.4s, v24.4s, v12.4s\n"
- "smax v23.4s, v23.4s, v8.4s\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "smax v13.4s, v13.4s, v8.4s\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v27.4s\n"
+ "srshl v12.4s, v12.4s, v8.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "add v12.4s, v12.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v27.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
+ "smax v12.4s, v12.4s, v27.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str s24, [x25, x11]\n"
- "ldr q2, [%x[params], #0xe0]\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s23, [x24, x11]\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str s31, [x23, x11]\n"
- "mov v25.16b, v2.16b\n"
- "str s13, [x22, x11]\n"
- "mov v21.16b, v2.16b\n"
- "mov v30.16b, v2.16b\n"
- ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n"
- ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n"
+ "smin v12.4s, v12.4s, v11.4s\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s24, [x11, x12]\n"
+ "ldr q26, [%x[params], #0xe0]\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "str s21, [x10, x12]\n"
+ "mov v5.16b, v26.16b\n"
+ "str s20, [x9, x12]\n"
+ "mov v21.16b, v26.16b\n"
+ "str s12, [x28, x12]\n"
+ "mov v0.16b, v26.16b\n"
+ ".inst 0x4e8e961a // sdot v26.4s, v16.16b, v14.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e9c9615 // sdot v21.4s, v16.16b, v28.16b\n"
"ext v14.16b, v14.16b, v14.16b, #0x1\n"
- "add x11, x11, #0x4\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n"
- "ldr q14, [x9, x12]\n"
- ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n"
- ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n"
- ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n"
- "ldr q27, [x21, x12]\n"
- ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n"
- "sqrdmulh v2.4s, v2.4s, v19.4s\n"
- ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n"
- ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n"
- "and v16.16b, v2.16b, v22.16b\n"
+ ".inst 0x4e9c963a // sdot v26.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8e9605 // sdot v5.4s, v16.16b, v14.16b\n"
+ "ldr q14, [x25, x13]\n"
+ ".inst 0x4e9c9600 // sdot v0.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e969635 // sdot v21.4s, v17.16b, v22.16b\n"
+ ".inst 0x4e96965a // sdot v26.4s, v18.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9c9625 // sdot v5.4s, v17.16b, v28.16b\n"
+ "ldr q28, [x21, x13]\n"
+ ".inst 0x4e969620 // sdot v0.4s, v17.16b, v22.16b\n"
+ ".inst 0x4e829655 // sdot v21.4s, v18.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v26.4s, v26.4s, v29.4s\n"
+ ".inst 0x4e969645 // sdot v5.4s, v18.16b, v22.16b\n"
+ ".inst 0x4e829640 // sdot v0.4s, v18.16b, v2.16b\n"
+ "and v16.16b, v26.16b, v23.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v25.4s, v25.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqrdmulh v30.4s, v30.4s, v19.4s\n"
- "ldr q11, [x14, x12]\n"
- "ldp x21, x20, [%x[inptrs], #0x40]\n"
- "ldr q5, [x21, x12]\n"
- "ldr q29, [x20, x12]\n"
- "sqadd v2.4s, v2.4s, v16.4s\n"
- "and v19.16b, v25.16b, v22.16b\n"
- "and v17.16b, v21.16b, v22.16b\n"
- "and v16.16b, v30.16b, v22.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v29.4s\n"
+ "ldr q12, [x14, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "and v20.16b, v21.16b, v23.16b\n"
"ldp x21, x20, [%x[inptrs], #0x50]\n"
- "ldr q26, [x21, x12]\n"
- "ldr q7, [x20, x12]\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "and v17.16b, v5.16b, v23.16b\n"
+ "ldr q8, [x23, x13]\n"
+ "ldr q29, [x22, x13]\n"
+ "and v16.16b, v0.16b, v23.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "ldr q18, [x21, x13]\n"
+ "ldr q22, [x20, x13]\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v23.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v2.4s, v2.4s, v22.4s\n"
- "sqadd v25.4s, v25.4s, v19.4s\n"
- "ldr q9, [%x[params], #0x160]\n"
- "sqadd v21.4s, v21.4s, v17.4s\n"
- "ldr q6, [%x[params], #0x170]\n"
- "sqadd v30.4s, v30.4s, v16.4s\n"
- "ldr q24, [%x[params], #0x150]\n"
- "add v2.4s, v2.4s, v15.4s\n"
- "srshl v25.4s, v25.4s, v22.4s\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "srshl v30.4s, v30.4s, v22.4s\n"
- "ldr q13, [x28, x12]\n"
- "smax v2.4s, v2.4s, v8.4s\n"
- "ldp x21, x20, [%x[inptrs], #0x60]\n"
- "ldr q16, [x21, x12]\n"
- "ldr q28, [x20, x12]\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "add v30.4s, v30.4s, v15.4s\n"
- "smin v2.4s, v2.4s, v12.4s\n"
+ "sqadd v21.4s, v21.4s, v20.4s\n"
+ "ldr q10, [%x[params], #0x170]\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "sqadd v0.4s, v0.4s, v16.4s\n"
+ "ldr q4, [%x[params], #0x150]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v5.4s, v5.4s, v23.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "ldr q15, [x24, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "add v21.4s, v21.4s, v13.4s\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "ldr q23, [x21, x12]\n"
- "ldr q1, [x20, x12]\n"
- "smax v25.4s, v25.4s, v8.4s\n"
- "smax v21.4s, v21.4s, v8.4s\n"
- "ldp x14, x13, [%x[inptrs], #0x0]\n"
- "smax v30.4s, v30.4s, v8.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "ldp x10, x9, [%x[inptrs], #0x10]\n"
- "ldp x28, x27, [%x[inptrs], #0x20]\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "smin v30.4s, v30.4s, v12.4s\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v2.16b, v2.16b, v2.16b\n"
- "str s2, [x25, x11]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "ldp x14, x27, [%x[inptrs], #0x0]\n"
+ "add v5.4s, v5.4s, v13.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
+ "ldr q9, [x23, x13]\n"
+ "ldr q24, [x22, x13]\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "ldp x24, x23, [%x[inptrs], #0x20]\n"
+ "ldr q25, [x21, x13]\n"
+ "ldr q2, [x20, x13]\n"
+ "smax v5.4s, v5.4s, v27.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "ldp x22, x21, [%x[inptrs], #0x30]\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v5.4s, v5.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "zip2 v18.16b, v11.16b, v3.16b\n"
- "zip1 v11.16b, v11.16b, v3.16b\n"
- "zip1 v17.16b, v10.16b, v14.16b\n"
- "zip2 v14.16b, v10.16b, v14.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x24, x11]\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str s26, [x11, x12]\n"
+ "zip2 v17.16b, v12.16b, v30.16b\n"
+ "zip1 v12.16b, v12.16b, v30.16b\n"
+ "zip1 v16.16b, v3.16b, v14.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "zip2 v14.16b, v3.16b, v14.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s21, [x23, x11]\n"
- "str s30, [x22, x11]\n"
- "zip2 v10.16b, v11.16b, v17.16b\n"
- "zip1 v11.16b, v11.16b, v17.16b\n"
- "add x11, x11, #0x4\n"
- "zip1 v3.16b, v18.16b, v14.16b\n"
- "zip2 v14.16b, v18.16b, v14.16b\n"
- "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v23.16b, v12.16b, v16.16b\n"
+ "zip1 v12.16b, v12.16b, v16.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "zip1 v30.16b, v17.16b, v14.16b\n"
+ "str s5, [x10, x12]\n"
+ "str s21, [x9, x12]\n"
+ "zip2 v14.16b, v17.16b, v14.16b\n"
+ "ldr q3, [%x[params], #0x140]\n"
+ "zip2 v21.16b, v15.16b, v1.16b\n"
+ "zip1 v15.16b, v15.16b, v1.16b\n"
+ "zip1 v20.16b, v19.16b, v28.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip2 v22.16b, v13.16b, v4.16b\n"
- "zip1 v13.16b, v13.16b, v4.16b\n"
- "zip1 v2.16b, v20.16b, v27.16b\n"
- "zip2 v27.16b, v20.16b, v27.16b\n"
- "zip2 v19.16b, v5.16b, v26.16b\n"
- "zip1 v5.16b, v5.16b, v26.16b\n"
- "zip1 v18.16b, v29.16b, v7.16b\n"
- "zip2 v7.16b, v29.16b, v7.16b\n"
- "zip2 v4.16b, v16.16b, v23.16b\n"
- "zip1 v16.16b, v16.16b, v23.16b\n"
- "zip1 v17.16b, v28.16b, v1.16b\n"
- "zip2 v1.16b, v28.16b, v1.16b\n"
- "zip2 v28.16b, v13.16b, v2.16b\n"
- "zip1 v13.16b, v13.16b, v2.16b\n"
- "zip1 v21.16b, v22.16b, v27.16b\n"
- "zip2 v27.16b, v22.16b, v27.16b\n"
- "zip2 v29.16b, v5.16b, v18.16b\n"
- "zip1 v5.16b, v5.16b, v18.16b\n"
- "zip1 v0.16b, v19.16b, v7.16b\n"
- "zip2 v7.16b, v19.16b, v7.16b\n"
- "zip2 v30.16b, v16.16b, v17.16b\n"
- "zip1 v16.16b, v16.16b, v17.16b\n"
- "zip1 v2.16b, v4.16b, v1.16b\n"
- "zip2 v1.16b, v4.16b, v1.16b\n"
- "mov v26.16b, v31.16b\n"
- "mov v18.16b, v31.16b\n"
- "mov v4.16b, v31.16b\n"
+ "str s0, [x28, x12]\n"
+ "zip2 v28.16b, v19.16b, v28.16b\n"
+ "zip2 v19.16b, v8.16b, v18.16b\n"
+ "add x12, x12, #0x4\n"
+ "zip1 v8.16b, v8.16b, v18.16b\n"
+ "zip1 v18.16b, v29.16b, v22.16b\n"
+ "zip2 v22.16b, v29.16b, v22.16b\n"
+ "zip2 v17.16b, v9.16b, v25.16b\n"
+ "zip1 v9.16b, v9.16b, v25.16b\n"
+ "zip1 v16.16b, v24.16b, v2.16b\n"
+ "zip2 v2.16b, v24.16b, v2.16b\n"
+ "zip2 v5.16b, v15.16b, v20.16b\n"
+ "zip1 v15.16b, v15.16b, v20.16b\n"
+ "zip1 v7.16b, v21.16b, v28.16b\n"
+ "zip2 v28.16b, v21.16b, v28.16b\n"
+ "zip2 v21.16b, v8.16b, v18.16b\n"
+ "zip1 v8.16b, v8.16b, v18.16b\n"
+ "zip1 v29.16b, v19.16b, v22.16b\n"
+ "zip2 v22.16b, v19.16b, v22.16b\n"
+ "zip2 v1.16b, v9.16b, v16.16b\n"
+ "zip1 v9.16b, v9.16b, v16.16b\n"
+ "zip1 v31.16b, v17.16b, v2.16b\n"
+ "zip2 v2.16b, v17.16b, v2.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "mov v18.16b, v3.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
- ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
- "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ ".inst 0x4e8c9483 // sdot v3.4s, v4.16b, v12.16b\n"
+ ".inst 0x4e8f9480 // sdot v0.4s, v4.16b, v15.16b\n"
+ "ext v12.16b, v12.16b, v12.16b, #0x1\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
- ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "add x13, x13, #0x10\n"
+ ".inst 0x4e8c949a // sdot v26.4s, v4.16b, v12.16b\n"
"ldr q17, [%x[params], #0x0]\n"
- ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
- ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
- "add x12, x12, #0x10\n"
- ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ ".inst 0x4e8f94c3 // sdot v3.4s, v6.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e8894c0 // sdot v0.4s, v6.16b, v8.16b\n"
+ ".inst 0x4e8f9492 // sdot v18.4s, v4.16b, v15.16b\n"
+ ".inst 0x4e889543 // sdot v3.4s, v10.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8f94da // sdot v26.4s, v6.16b, v15.16b\n"
"ldr q19, [%x[params], #0x10]\n"
- ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
- ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
- ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
- "and v16.16b, v31.16b, v19.16b\n"
+ ".inst 0x4e8894d2 // sdot v18.4s, v6.16b, v8.16b\n"
+ ".inst 0x4e899540 // sdot v0.4s, v10.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ ".inst 0x4e88955a // sdot v26.4s, v10.16b, v8.16b\n"
+ ".inst 0x4e899552 // sdot v18.4s, v10.16b, v9.16b\n"
+ "and v16.16b, v3.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "ldr q24, [%x[params], #0x60]\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v20.16b, v26.16b, v19.16b\n"
- "and v17.16b, v18.16b, v19.16b\n"
- "and v16.16b, v4.16b, v19.16b\n"
+ "ldr q4, [%x[params], #0x60]\n"
+ "and v20.16b, v0.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v19.16b\n"
+ "and v16.16b, v18.16b, v19.16b\n"
"sshr v20.4s, v20.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v19.4s\n"
- "sqadd v26.4s, v26.4s, v20.4s\n"
- "ldr q5, [%x[params], #0x40]\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "ldr q17, [%x[params], #0x50]\n"
- "sqadd v4.4s, v4.4s, v16.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "ldr q6, [%x[params], #0x50]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
"ldr q16, [%x[params], #0x30]\n"
- "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v0.4s, v0.4s, v19.4s\n"
"srshl v26.4s, v26.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
"srshl v18.4s, v18.4s, v19.4s\n"
- "srshl v4.4s, v4.4s, v19.4s\n"
- "ldr q23, [%x[params], #0x70]\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v18.4s, v18.4s, v15.4s\n"
- "add v4.4s, v4.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v18.4s, v18.4s, v8.4s\n"
- "smax v4.4s, v4.4s, v8.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smin v4.4s, v4.4s, v12.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "ldr q19, [%x[params], #0x70]\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v13.4s\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smax v18.4s, v18.4s, v27.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s31, [x25, x11]\n"
+ "str s3, [x11, x12]\n"
"ldr q25, [%x[params], #0x20]\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s26, [x24, x11]\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s18, [x23, x11]\n"
- "mov v22.16b, v25.16b\n"
- "str s4, [x22, x11]\n"
+ "str s26, [x10, x12]\n"
+ "mov v24.16b, v25.16b\n"
+ "str s0, [x9, x12]\n"
"mov v20.16b, v25.16b\n"
- "mov v19.16b, v25.16b\n"
- ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n"
- ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "add x11, x11, #0x4\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n"
- ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
- ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n"
- ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n"
- ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n"
- "sqrdmulh v25.4s, v25.4s, v24.4s\n"
- ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n"
- ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n"
- "and v16.16b, v25.16b, v23.16b\n"
+ "str s18, [x28, x12]\n"
+ "mov v3.16b, v25.16b\n"
+ ".inst 0x4e979619 // sdot v25.4s, v16.16b, v23.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e859614 // sdot v20.4s, v16.16b, v5.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e859639 // sdot v25.4s, v17.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e979618 // sdot v24.4s, v16.16b, v23.16b\n"
+ ".inst 0x4e859603 // sdot v3.4s, v16.16b, v5.16b\n"
+ ".inst 0x4e959634 // sdot v20.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e9594d9 // sdot v25.4s, v6.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e859638 // sdot v24.4s, v17.16b, v5.16b\n"
+ ".inst 0x4e959623 // sdot v3.4s, v17.16b, v21.16b\n"
+ ".inst 0x4e8194d4 // sdot v20.4s, v6.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ ".inst 0x4e9594d8 // sdot v24.4s, v6.16b, v21.16b\n"
+ ".inst 0x4e8194c3 // sdot v3.4s, v6.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v19.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v24.4s\n"
- "sqrdmulh v20.4s, v20.4s, v24.4s\n"
- "sqrdmulh v19.4s, v19.4s, v24.4s\n"
- "ldr q24, [%x[params], #0xc0]\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v4.4s\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "and v18.16b, v20.16b, v19.16b\n"
"sqadd v25.4s, v25.4s, v16.4s\n"
- "and v18.16b, v22.16b, v23.16b\n"
- "and v17.16b, v20.16b, v23.16b\n"
- "and v16.16b, v19.16b, v23.16b\n"
+ "and v17.16b, v24.16b, v19.16b\n"
+ "and v16.16b, v3.16b, v19.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v19.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v25.4s, v25.4s, v23.4s\n"
- "sqadd v22.4s, v22.4s, v18.4s\n"
- "ldr q18, [%x[params], #0xa0]\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q17, [%x[params], #0xb0]\n"
- "sqadd v19.4s, v19.4s, v16.4s\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xb0]\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xa0]\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
"ldr q16, [%x[params], #0x90]\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "srshl v22.4s, v22.4s, v23.4s\n"
- "srshl v20.4s, v20.4s, v23.4s\n"
- "srshl v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0xd0]\n"
- "smax v25.4s, v25.4s, v8.4s\n"
- "add v22.4s, v22.4s, v15.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "smax v22.4s, v22.4s, v8.4s\n"
- "smax v20.4s, v20.4s, v8.4s\n"
- "smax v19.4s, v19.4s, v8.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
+ "srshl v24.4s, v24.4s, v19.4s\n"
+ "smax v25.4s, v25.4s, v27.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "ldr q5, [%x[params], #0xd0]\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v27.4s\n"
+ "smax v24.4s, v24.4s, v27.4s\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s25, [x25, x11]\n"
- "ldr q10, [%x[params], #0x80]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s22, [x24, x11]\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s25, [x11, x12]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s20, [x23, x11]\n"
- "str s19, [x22, x11]\n"
- "mov v28.16b, v10.16b\n"
- "mov v20.16b, v10.16b\n"
- ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n"
- "mov v19.16b, v10.16b\n"
- ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n"
- ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n"
- "add x11, x11, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n"
- ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n"
- ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n"
- ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n"
- ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n"
- ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "sqrdmulh v10.4s, v10.4s, v24.4s\n"
- ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n"
- ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n"
- "and v16.16b, v10.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str s24, [x10, x12]\n"
+ "mov v26.16b, v21.16b\n"
+ "str s20, [x9, x12]\n"
+ "mov v20.16b, v21.16b\n"
+ "str s3, [x28, x12]\n"
+ "mov v19.16b, v21.16b\n"
+ ".inst 0x4e9e9615 // sdot v21.4s, v16.16b, v30.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e879614 // sdot v20.4s, v16.16b, v7.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9e961a // sdot v26.4s, v16.16b, v30.16b\n"
+ ".inst 0x4e879613 // sdot v19.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e9d9634 // sdot v20.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9d9655 // sdot v21.4s, v18.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e87963a // sdot v26.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e9d9633 // sdot v19.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9f9654 // sdot v20.4s, v18.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ ".inst 0x4e9d965a // sdot v26.4s, v18.16b, v29.16b\n"
+ ".inst 0x4e9f9653 // sdot v19.4s, v18.16b, v31.16b\n"
+ "and v16.16b, v21.16b, v5.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v24.4s\n"
- "sqrdmulh v20.4s, v20.4s, v24.4s\n"
- "sqrdmulh v19.4s, v19.4s, v24.4s\n"
- "ldr q24, [%x[params], #0x120]\n"
- "sqadd v10.4s, v10.4s, v16.4s\n"
- "and v18.16b, v28.16b, v23.16b\n"
- "and v17.16b, v20.16b, v23.16b\n"
- "and v16.16b, v19.16b, v23.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "ldr q25, [%x[params], #0x120]\n"
+ "and v18.16b, v20.16b, v5.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v5.16b\n"
+ "and v16.16b, v19.16b, v5.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v10.4s, v10.4s, v23.4s\n"
- "sqadd v28.4s, v28.4s, v18.4s\n"
- "ldr q18, [%x[params], #0x100]\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x110]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x100]\n"
+ "add v21.4s, v21.4s, v13.4s\n"
"sqadd v19.4s, v19.4s, v16.4s\n"
"ldr q16, [%x[params], #0xf0]\n"
- "add v10.4s, v10.4s, v15.4s\n"
- "srshl v28.4s, v28.4s, v23.4s\n"
- "srshl v20.4s, v20.4s, v23.4s\n"
- "srshl v19.4s, v19.4s, v23.4s\n"
- "ldr q23, [%x[params], #0x130]\n"
- "smax v10.4s, v10.4s, v8.4s\n"
- "add v28.4s, v28.4s, v15.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "smax v28.4s, v28.4s, v8.4s\n"
- "smax v20.4s, v20.4s, v8.4s\n"
- "smax v19.4s, v19.4s, v8.4s\n"
- "smin v28.4s, v28.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s10, [x25, x11]\n"
- "ldr q22, [%x[params], #0xe0]\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "ldr q24, [%x[params], #0x130]\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v27.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smax v19.4s, v19.4s, v27.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x11, x12]\n"
+ "ldr q23, [%x[params], #0xe0]\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s28, [x24, x11]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s20, [x23, x11]\n"
- "mov v21.16b, v22.16b\n"
- "str s19, [x22, x11]\n"
- "mov v20.16b, v22.16b\n"
- "mov v19.16b, v22.16b\n"
- ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n"
- ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
- ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n"
+ "str s20, [x9, x12]\n"
+ "mov v21.16b, v23.16b\n"
+ "str s26, [x10, x12]\n"
+ "mov v20.16b, v23.16b\n"
+ "str s19, [x28, x12]\n"
+ "mov v19.16b, v23.16b\n"
+ ".inst 0x4e8e9617 // sdot v23.4s, v16.16b, v14.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x4e9c9615 // sdot v21.4s, v16.16b, v28.16b\n"
"ext v14.16b, v14.16b, v14.16b, #0x1\n"
- "add x11, x11, #0x4\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n"
- ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n"
- ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n"
- ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
- ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n"
- "sqrdmulh v22.4s, v22.4s, v24.4s\n"
- ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
- ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n"
- "and v16.16b, v22.16b, v23.16b\n"
+ ".inst 0x4e9c9637 // sdot v23.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8e9614 // sdot v20.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e969635 // sdot v21.4s, v17.16b, v22.16b\n"
+ ".inst 0x4e969657 // sdot v23.4s, v18.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9c9634 // sdot v20.4s, v17.16b, v28.16b\n"
+ ".inst 0x4e969633 // sdot v19.4s, v17.16b, v22.16b\n"
+ ".inst 0x4e829655 // sdot v21.4s, v18.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v23.4s, v23.4s, v25.4s\n"
+ ".inst 0x4e969654 // sdot v20.4s, v18.16b, v22.16b\n"
+ ".inst 0x4e829653 // sdot v19.4s, v18.16b, v2.16b\n"
+ "and v16.16b, v23.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v25.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v24.4s\n"
- "sqrdmulh v20.4s, v20.4s, v24.4s\n"
- "sqrdmulh v19.4s, v19.4s, v24.4s\n"
- "sqadd v22.4s, v22.4s, v16.4s\n"
- "and v18.16b, v21.16b, v23.16b\n"
- "and v17.16b, v20.16b, v23.16b\n"
- "and v16.16b, v19.16b, v23.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v25.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v25.4s\n"
+ "and v18.16b, v21.16b, v24.16b\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "and v17.16b, v20.16b, v24.16b\n"
+ "and v16.16b, v19.16b, v24.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v23.4s, v23.4s, v24.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v21.4s, v21.4s, v18.4s\n"
"sqadd v20.4s, v20.4s, v17.4s\n"
+ "add v23.4s, v23.4s, v13.4s\n"
"sqadd v19.4s, v19.4s, v16.4s\n"
- "srshl v22.4s, v22.4s, v23.4s\n"
- "srshl v21.4s, v21.4s, v23.4s\n"
- "srshl v20.4s, v20.4s, v23.4s\n"
- "srshl v19.4s, v19.4s, v23.4s\n"
- "add v22.4s, v22.4s, v15.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "smax v22.4s, v22.4s, v8.4s\n"
- "smax v21.4s, v21.4s, v8.4s\n"
- "smax v20.4s, v20.4s, v8.4s\n"
- "smax v19.4s, v19.4s, v8.4s\n"
- "smin v22.4s, v22.4s, v12.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "srshl v21.4s, v21.4s, v24.4s\n"
+ "srshl v20.4s, v20.4s, v24.4s\n"
+ "smax v23.4s, v23.4s, v27.4s\n"
+ "srshl v19.4s, v19.4s, v24.4s\n"
+ "add v21.4s, v21.4s, v13.4s\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v27.4s\n"
+ "smax v20.4s, v20.4s, v27.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v19.4s, v19.4s, v27.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s23, [x11, x12]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s22, [x25, x11]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str s21, [x24, x11]\n"
- "str s20, [x23, x11]\n"
- "str s19, [x22, x11]\n"
- "add x11, x11, #0x4\n"
+ "str s20, [x10, x12]\n"
+ "str s21, [x9, x12]\n"
+ "str s19, [x28, x12]\n"
+ "add x12, x12, #0x4\n"
"beq 35f\n"
"3:" // Oddments
"and x20, %x[n_channels], #0xf\n"
- "add x14, x14, x12\n"
- "add x13, x13, x12\n"
- "add x10, x10, x12\n"
- "add x9, x9, x12\n"
- "add x28, x28, x12\n"
- "add x27, x27, x12\n"
- "add x26, x26, x12\n"
- "add x21, x21, x12\n"
+ "add x14, x14, x13\n"
+ "add x27, x27, x13\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d11, [x14], #0x8\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d3, [x10], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d13, [x28], #0x8\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d21, [x26], #0x8\n"
- "ldr d27, [x21], #0x8\n"
+ "ldr d12, [x14], #0x8\n"
+ "ldr d23, [x27], #0x8\n"
+ "ldr d30, [x26], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d7, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v11.s }[2], [x14], #0x4\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v3.s }[2], [x10], #0x4\n"
- "ld1 { v14.s }[2], [x9], #0x4\n"
- "ld1 { v13.s }[2], [x28], #0x4\n"
- "ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v21.s }[2], [x26], #0x4\n"
- "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x14], #0x4\n"
+ "ld1 { v23.s }[2], [x27], #0x4\n"
+ "ld1 { v30.s }[2], [x26], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v7.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v11.h }[6], [x14], #0x2\n"
- "ld1 { v10.h }[6], [x13], #0x2\n"
- "ld1 { v3.h }[6], [x10], #0x2\n"
- "ld1 { v14.h }[6], [x9], #0x2\n"
- "ld1 { v13.h }[6], [x28], #0x2\n"
- "ld1 { v28.h }[6], [x27], #0x2\n"
- "ld1 { v21.h }[6], [x26], #0x2\n"
- "ld1 { v27.h }[6], [x21], #0x2\n"
+ "ld1 { v12.h }[6], [x14], #0x2\n"
+ "ld1 { v23.h }[6], [x27], #0x2\n"
+ "ld1 { v30.h }[6], [x26], #0x2\n"
+ "ld1 { v14.h }[6], [x25], #0x2\n"
+ "ld1 { v15.h }[6], [x24], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v7.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[14], [x14], #0x1\n"
- "ld1 { v10.b }[14], [x13], #0x1\n"
- "ld1 { v3.b }[14], [x10], #0x1\n"
- "ld1 { v14.b }[14], [x9], #0x1\n"
- "ld1 { v13.b }[14], [x28], #0x1\n"
- "ld1 { v28.b }[14], [x27], #0x1\n"
- "ld1 { v21.b }[14], [x26], #0x1\n"
- "ld1 { v27.b }[14], [x21], #0x1\n"
+ "ld1 { v12.b }[14], [x14], #0x1\n"
+ "ld1 { v23.b }[14], [x27], #0x1\n"
+ "ld1 { v30.b }[14], [x26], #0x1\n"
+ "ld1 { v14.b }[14], [x25], #0x1\n"
+ "ld1 { v15.b }[14], [x24], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
+ "ld1 { v7.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[12], [x14], #0x1\n"
- "ld1 { v10.b }[12], [x13], #0x1\n"
- "ld1 { v3.b }[12], [x10], #0x1\n"
- "ld1 { v14.b }[12], [x9], #0x1\n"
- "ld1 { v13.b }[12], [x28], #0x1\n"
- "ld1 { v28.b }[12], [x27], #0x1\n"
- "ld1 { v21.b }[12], [x26], #0x1\n"
- "ld1 { v27.b }[12], [x21], #0x1\n"
+ "ld1 { v12.b }[12], [x14], #0x1\n"
+ "ld1 { v23.b }[12], [x27], #0x1\n"
+ "ld1 { v30.b }[12], [x26], #0x1\n"
+ "ld1 { v14.b }[12], [x25], #0x1\n"
+ "ld1 { v15.b }[12], [x24], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
+ "ld1 { v7.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v11.h }[4], [x14], #0x2\n"
- "ld1 { v10.h }[4], [x13], #0x2\n"
- "ld1 { v3.h }[4], [x10], #0x2\n"
- "ld1 { v14.h }[4], [x9], #0x2\n"
- "ld1 { v13.h }[4], [x28], #0x2\n"
- "ld1 { v28.h }[4], [x27], #0x2\n"
- "ld1 { v21.h }[4], [x26], #0x2\n"
- "ld1 { v27.h }[4], [x21], #0x2\n"
+ "ld1 { v12.h }[4], [x14], #0x2\n"
+ "ld1 { v23.h }[4], [x27], #0x2\n"
+ "ld1 { v30.h }[4], [x26], #0x2\n"
+ "ld1 { v14.h }[4], [x25], #0x2\n"
+ "ld1 { v15.h }[4], [x24], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v7.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[10], [x14], #0x1\n"
- "ld1 { v10.b }[10], [x13], #0x1\n"
- "ld1 { v3.b }[10], [x10], #0x1\n"
- "ld1 { v14.b }[10], [x9], #0x1\n"
- "ld1 { v13.b }[10], [x28], #0x1\n"
- "ld1 { v28.b }[10], [x27], #0x1\n"
- "ld1 { v21.b }[10], [x26], #0x1\n"
- "ld1 { v27.b }[10], [x21], #0x1\n"
+ "ld1 { v12.b }[10], [x14], #0x1\n"
+ "ld1 { v23.b }[10], [x27], #0x1\n"
+ "ld1 { v30.b }[10], [x26], #0x1\n"
+ "ld1 { v14.b }[10], [x25], #0x1\n"
+ "ld1 { v15.b }[10], [x24], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
+ "ld1 { v7.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[8], [x14], #0x1\n"
- "ld1 { v10.b }[8], [x13], #0x1\n"
- "ld1 { v3.b }[8], [x10], #0x1\n"
- "ld1 { v14.b }[8], [x9], #0x1\n"
- "ld1 { v13.b }[8], [x28], #0x1\n"
- "ld1 { v28.b }[8], [x27], #0x1\n"
- "ld1 { v21.b }[8], [x26], #0x1\n"
- "ld1 { v27.b }[8], [x21], #0x1\n"
+ "ld1 { v12.b }[8], [x14], #0x1\n"
+ "ld1 { v23.b }[8], [x27], #0x1\n"
+ "ld1 { v30.b }[8], [x26], #0x1\n"
+ "ld1 { v14.b }[8], [x25], #0x1\n"
+ "ld1 { v15.b }[8], [x24], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
+ "ld1 { v7.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s11, [x14], #0x4\n"
- "ldr s10, [x13], #0x4\n"
- "ldr s3, [x10], #0x4\n"
- "ldr s14, [x9], #0x4\n"
- "ldr s13, [x28], #0x4\n"
- "ldr s28, [x27], #0x4\n"
- "ldr s21, [x26], #0x4\n"
- "ldr s27, [x21], #0x4\n"
+ "ldr s12, [x14], #0x4\n"
+ "ldr s23, [x27], #0x4\n"
+ "ldr s30, [x26], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "ldr s15, [x24], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s7, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v11.h }[2], [x14], #0x2\n"
- "ld1 { v10.h }[2], [x13], #0x2\n"
- "ld1 { v3.h }[2], [x10], #0x2\n"
- "ld1 { v14.h }[2], [x9], #0x2\n"
- "ld1 { v13.h }[2], [x28], #0x2\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "ld1 { v21.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v12.h }[2], [x14], #0x2\n"
+ "ld1 { v23.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v14.h }[2], [x25], #0x2\n"
+ "ld1 { v15.h }[2], [x24], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v7.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[6], [x14], #0x1\n"
- "ld1 { v10.b }[6], [x13], #0x1\n"
- "ld1 { v3.b }[6], [x10], #0x1\n"
- "ld1 { v14.b }[6], [x9], #0x1\n"
- "ld1 { v13.b }[6], [x28], #0x1\n"
- "ld1 { v28.b }[6], [x27], #0x1\n"
- "ld1 { v21.b }[6], [x26], #0x1\n"
- "ld1 { v27.b }[6], [x21], #0x1\n"
+ "ld1 { v12.b }[6], [x14], #0x1\n"
+ "ld1 { v23.b }[6], [x27], #0x1\n"
+ "ld1 { v30.b }[6], [x26], #0x1\n"
+ "ld1 { v14.b }[6], [x25], #0x1\n"
+ "ld1 { v15.b }[6], [x24], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
+ "ld1 { v7.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[4], [x14], #0x1\n"
- "ld1 { v10.b }[4], [x13], #0x1\n"
- "ld1 { v3.b }[4], [x10], #0x1\n"
- "ld1 { v14.b }[4], [x9], #0x1\n"
- "ld1 { v13.b }[4], [x28], #0x1\n"
- "ld1 { v28.b }[4], [x27], #0x1\n"
- "ld1 { v21.b }[4], [x26], #0x1\n"
- "ld1 { v27.b }[4], [x21], #0x1\n"
+ "ld1 { v12.b }[4], [x14], #0x1\n"
+ "ld1 { v23.b }[4], [x27], #0x1\n"
+ "ld1 { v30.b }[4], [x26], #0x1\n"
+ "ld1 { v14.b }[4], [x25], #0x1\n"
+ "ld1 { v15.b }[4], [x24], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
+ "ld1 { v7.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h11, [x14], #0x2\n"
- "ldr h10, [x13], #0x2\n"
- "ldr h3, [x10], #0x2\n"
- "ldr h14, [x9], #0x2\n"
- "ldr h13, [x28], #0x2\n"
- "ldr h28, [x27], #0x2\n"
- "ldr h21, [x26], #0x2\n"
- "ldr h27, [x21], #0x2\n"
+ "ldr h12, [x14], #0x2\n"
+ "ldr h23, [x27], #0x2\n"
+ "ldr h30, [x26], #0x2\n"
+ "ldr h14, [x25], #0x2\n"
+ "ldr h15, [x24], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h7, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v11.b }[2], [x14], #0x1\n"
- "ld1 { v10.b }[2], [x13], #0x1\n"
- "ld1 { v3.b }[2], [x10], #0x1\n"
- "ld1 { v14.b }[2], [x9], #0x1\n"
- "ld1 { v13.b }[2], [x28], #0x1\n"
- "ld1 { v28.b }[2], [x27], #0x1\n"
- "ld1 { v21.b }[2], [x26], #0x1\n"
- "ld1 { v27.b }[2], [x21], #0x1\n"
+ "ld1 { v12.b }[2], [x14], #0x1\n"
+ "ld1 { v23.b }[2], [x27], #0x1\n"
+ "ld1 { v30.b }[2], [x26], #0x1\n"
+ "ld1 { v14.b }[2], [x25], #0x1\n"
+ "ld1 { v15.b }[2], [x24], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
+ "ld1 { v7.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b11, [x14], #0x1\n"
- "ldr b10, [x13], #0x1\n"
- "ldr b3, [x10], #0x1\n"
- "ldr b14, [x9], #0x1\n"
- "ldr b13, [x28], #0x1\n"
- "ldr b28, [x27], #0x1\n"
- "ldr b21, [x26], #0x1\n"
- "ldr b27, [x21], #0x1\n"
+ "ldr b12, [x14], #0x1\n"
+ "ldr b23, [x27], #0x1\n"
+ "ldr b30, [x26], #0x1\n"
+ "ldr b14, [x25], #0x1\n"
+ "ldr b15, [x24], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
+ "ldr b7, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
- "ldp x14, x13, [%x[inptrs], #0x40]\n"
- "ldp x10, x9, [%x[inptrs], #0x50]\n"
- "add x14, x14, x12\n"
- "add x13, x13, x12\n"
- "ldp x28, x27, [%x[inptrs], #0x60]\n"
- "ldp x26, x21, [%x[inptrs], #0x70]\n"
- "add x10, x10, x12\n"
- "add x9, x9, x12\n"
- "add x28, x28, x12\n"
- "add x27, x27, x12\n"
- "add x26, x26, x12\n"
- "add x21, x21, x12\n"
+ "ldp x14, x27, [%x[inptrs], #0x40]\n"
+ "ldp x26, x25, [%x[inptrs], #0x50]\n"
+ "ldp x24, x23, [%x[inptrs], #0x60]\n"
+ "ldp x22, x21, [%x[inptrs], #0x70]\n"
+ "add x14, x14, x13\n"
+ "add x27, x27, x13\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d5, [x14], #0x8\n"
- "ldr d29, [x13], #0x8\n"
- "ldr d0, [x10], #0x8\n"
- "ldr d7, [x9], #0x8\n"
- "ldr d16, [x28], #0x8\n"
- "ldr d30, [x27], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d1, [x21], #0x8\n"
+ "ldr d8, [x14], #0x8\n"
+ "ldr d21, [x27], #0x8\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d31, [x22], #0x8\n"
+ "ldr d2, [x21], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v5.s }[2], [x14], #0x4\n"
- "ld1 { v29.s }[2], [x13], #0x4\n"
- "ld1 { v0.s }[2], [x10], #0x4\n"
- "ld1 { v7.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x28], #0x4\n"
- "ld1 { v30.s }[2], [x27], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
+ "ld1 { v8.s }[2], [x14], #0x4\n"
+ "ld1 { v21.s }[2], [x27], #0x4\n"
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v1.s }[2], [x23], #0x4\n"
+ "ld1 { v31.s }[2], [x22], #0x4\n"
+ "ld1 { v2.s }[2], [x21], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v5.h }[6], [x14], #0x2\n"
- "ld1 { v29.h }[6], [x13], #0x2\n"
- "ld1 { v0.h }[6], [x10], #0x2\n"
- "ld1 { v7.h }[6], [x9], #0x2\n"
- "ld1 { v16.h }[6], [x28], #0x2\n"
- "ld1 { v30.h }[6], [x27], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
+ "ld1 { v8.h }[6], [x14], #0x2\n"
+ "ld1 { v21.h }[6], [x27], #0x2\n"
+ "ld1 { v29.h }[6], [x26], #0x2\n"
+ "ld1 { v22.h }[6], [x25], #0x2\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v1.h }[6], [x23], #0x2\n"
+ "ld1 { v31.h }[6], [x22], #0x2\n"
+ "ld1 { v2.h }[6], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[14], [x14], #0x1\n"
- "ld1 { v29.b }[14], [x13], #0x1\n"
- "ld1 { v0.b }[14], [x10], #0x1\n"
- "ld1 { v7.b }[14], [x9], #0x1\n"
- "ld1 { v16.b }[14], [x28], #0x1\n"
- "ld1 { v30.b }[14], [x27], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
+ "ld1 { v8.b }[14], [x14], #0x1\n"
+ "ld1 { v21.b }[14], [x27], #0x1\n"
+ "ld1 { v29.b }[14], [x26], #0x1\n"
+ "ld1 { v22.b }[14], [x25], #0x1\n"
+ "ld1 { v9.b }[14], [x24], #0x1\n"
+ "ld1 { v1.b }[14], [x23], #0x1\n"
+ "ld1 { v31.b }[14], [x22], #0x1\n"
+ "ld1 { v2.b }[14], [x21], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[12], [x14], #0x1\n"
- "ld1 { v29.b }[12], [x13], #0x1\n"
- "ld1 { v0.b }[12], [x10], #0x1\n"
- "ld1 { v7.b }[12], [x9], #0x1\n"
- "ld1 { v16.b }[12], [x28], #0x1\n"
- "ld1 { v30.b }[12], [x27], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
+ "ld1 { v8.b }[12], [x14], #0x1\n"
+ "ld1 { v21.b }[12], [x27], #0x1\n"
+ "ld1 { v29.b }[12], [x26], #0x1\n"
+ "ld1 { v22.b }[12], [x25], #0x1\n"
+ "ld1 { v9.b }[12], [x24], #0x1\n"
+ "ld1 { v1.b }[12], [x23], #0x1\n"
+ "ld1 { v31.b }[12], [x22], #0x1\n"
+ "ld1 { v2.b }[12], [x21], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v5.h }[4], [x14], #0x2\n"
- "ld1 { v29.h }[4], [x13], #0x2\n"
- "ld1 { v0.h }[4], [x10], #0x2\n"
- "ld1 { v7.h }[4], [x9], #0x2\n"
- "ld1 { v16.h }[4], [x28], #0x2\n"
- "ld1 { v30.h }[4], [x27], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
+ "ld1 { v8.h }[4], [x14], #0x2\n"
+ "ld1 { v21.h }[4], [x27], #0x2\n"
+ "ld1 { v29.h }[4], [x26], #0x2\n"
+ "ld1 { v22.h }[4], [x25], #0x2\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v1.h }[4], [x23], #0x2\n"
+ "ld1 { v31.h }[4], [x22], #0x2\n"
+ "ld1 { v2.h }[4], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[10], [x14], #0x1\n"
- "ld1 { v29.b }[10], [x13], #0x1\n"
- "ld1 { v0.b }[10], [x10], #0x1\n"
- "ld1 { v7.b }[10], [x9], #0x1\n"
- "ld1 { v16.b }[10], [x28], #0x1\n"
- "ld1 { v30.b }[10], [x27], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
+ "ld1 { v8.b }[10], [x14], #0x1\n"
+ "ld1 { v21.b }[10], [x27], #0x1\n"
+ "ld1 { v29.b }[10], [x26], #0x1\n"
+ "ld1 { v22.b }[10], [x25], #0x1\n"
+ "ld1 { v9.b }[10], [x24], #0x1\n"
+ "ld1 { v1.b }[10], [x23], #0x1\n"
+ "ld1 { v31.b }[10], [x22], #0x1\n"
+ "ld1 { v2.b }[10], [x21], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[8], [x14], #0x1\n"
- "ld1 { v29.b }[8], [x13], #0x1\n"
- "ld1 { v0.b }[8], [x10], #0x1\n"
- "ld1 { v7.b }[8], [x9], #0x1\n"
- "ld1 { v16.b }[8], [x28], #0x1\n"
- "ld1 { v30.b }[8], [x27], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
+ "ld1 { v8.b }[8], [x14], #0x1\n"
+ "ld1 { v21.b }[8], [x27], #0x1\n"
+ "ld1 { v29.b }[8], [x26], #0x1\n"
+ "ld1 { v22.b }[8], [x25], #0x1\n"
+ "ld1 { v9.b }[8], [x24], #0x1\n"
+ "ld1 { v1.b }[8], [x23], #0x1\n"
+ "ld1 { v31.b }[8], [x22], #0x1\n"
+ "ld1 { v2.b }[8], [x21], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s5, [x14], #0x4\n"
- "ldr s29, [x13], #0x4\n"
- "ldr s0, [x10], #0x4\n"
- "ldr s7, [x9], #0x4\n"
- "ldr s16, [x28], #0x4\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s1, [x21], #0x4\n"
+ "ldr s8, [x14], #0x4\n"
+ "ldr s21, [x27], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s31, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v5.h }[2], [x14], #0x2\n"
- "ld1 { v29.h }[2], [x13], #0x2\n"
- "ld1 { v0.h }[2], [x10], #0x2\n"
- "ld1 { v7.h }[2], [x9], #0x2\n"
- "ld1 { v16.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
+ "ld1 { v8.h }[2], [x14], #0x2\n"
+ "ld1 { v21.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v22.h }[2], [x25], #0x2\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
+ "ld1 { v2.h }[2], [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[6], [x14], #0x1\n"
- "ld1 { v29.b }[6], [x13], #0x1\n"
- "ld1 { v0.b }[6], [x10], #0x1\n"
- "ld1 { v7.b }[6], [x9], #0x1\n"
- "ld1 { v16.b }[6], [x28], #0x1\n"
- "ld1 { v30.b }[6], [x27], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
+ "ld1 { v8.b }[6], [x14], #0x1\n"
+ "ld1 { v21.b }[6], [x27], #0x1\n"
+ "ld1 { v29.b }[6], [x26], #0x1\n"
+ "ld1 { v22.b }[6], [x25], #0x1\n"
+ "ld1 { v9.b }[6], [x24], #0x1\n"
+ "ld1 { v1.b }[6], [x23], #0x1\n"
+ "ld1 { v31.b }[6], [x22], #0x1\n"
+ "ld1 { v2.b }[6], [x21], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[4], [x14], #0x1\n"
- "ld1 { v29.b }[4], [x13], #0x1\n"
- "ld1 { v0.b }[4], [x10], #0x1\n"
- "ld1 { v7.b }[4], [x9], #0x1\n"
- "ld1 { v16.b }[4], [x28], #0x1\n"
- "ld1 { v30.b }[4], [x27], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
+ "ld1 { v8.b }[4], [x14], #0x1\n"
+ "ld1 { v21.b }[4], [x27], #0x1\n"
+ "ld1 { v29.b }[4], [x26], #0x1\n"
+ "ld1 { v22.b }[4], [x25], #0x1\n"
+ "ld1 { v9.b }[4], [x24], #0x1\n"
+ "ld1 { v1.b }[4], [x23], #0x1\n"
+ "ld1 { v31.b }[4], [x22], #0x1\n"
+ "ld1 { v2.b }[4], [x21], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h5, [x14], #0x2\n"
- "ldr h29, [x13], #0x2\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h7, [x9], #0x2\n"
- "ldr h16, [x28], #0x2\n"
- "ldr h30, [x27], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h1, [x21], #0x2\n"
+ "ldr h8, [x14], #0x2\n"
+ "ldr h21, [x27], #0x2\n"
+ "ldr h29, [x26], #0x2\n"
+ "ldr h22, [x25], #0x2\n"
+ "ldr h9, [x24], #0x2\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h31, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v5.b }[2], [x14], #0x1\n"
- "ld1 { v29.b }[2], [x13], #0x1\n"
- "ld1 { v0.b }[2], [x10], #0x1\n"
- "ld1 { v7.b }[2], [x9], #0x1\n"
- "ld1 { v16.b }[2], [x28], #0x1\n"
- "ld1 { v30.b }[2], [x27], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
+ "ld1 { v8.b }[2], [x14], #0x1\n"
+ "ld1 { v21.b }[2], [x27], #0x1\n"
+ "ld1 { v29.b }[2], [x26], #0x1\n"
+ "ld1 { v22.b }[2], [x25], #0x1\n"
+ "ld1 { v9.b }[2], [x24], #0x1\n"
+ "ld1 { v1.b }[2], [x23], #0x1\n"
+ "ld1 { v31.b }[2], [x22], #0x1\n"
+ "ld1 { v2.b }[2], [x21], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b5, [x14], #0x1\n"
- "ldr b29, [x13], #0x1\n"
- "ldr b0, [x10], #0x1\n"
- "ldr b7, [x9], #0x1\n"
- "ldr b16, [x28], #0x1\n"
- "ldr b30, [x27], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b1, [x21], #0x1\n"
+ "ldr b8, [x14], #0x1\n"
+ "ldr b21, [x27], #0x1\n"
+ "ldr b29, [x26], #0x1\n"
+ "ldr b22, [x25], #0x1\n"
+ "ldr b9, [x24], #0x1\n"
+ "ldr b1, [x23], #0x1\n"
+ "ldr b31, [x22], #0x1\n"
+ "ldr b2, [x21], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
- "ldr q25, [%x[params], #0x10]\n"
- "ldr q24, [%x[params], #0x20]\n"
- "zip2 v18.16b, v11.16b, v3.16b\n"
- "zip1 v11.16b, v11.16b, v3.16b\n"
- "ldr q23, [%x[params], #0x30]\n"
- "zip1 v17.16b, v10.16b, v14.16b\n"
- "zip2 v14.16b, v10.16b, v14.16b\n"
+ "ldr q10, [%x[params], #0x10]\n"
+ "ldr q4, [%x[params], #0x20]\n"
+ "zip2 v17.16b, v12.16b, v30.16b\n"
+ "zip1 v12.16b, v12.16b, v30.16b\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "zip1 v16.16b, v23.16b, v14.16b\n"
+ "zip2 v14.16b, v23.16b, v14.16b\n"
"cmp x20, #0x4\n"
- "zip2 v10.16b, v11.16b, v17.16b\n"
- "zip1 v11.16b, v11.16b, v17.16b\n"
- "zip1 v3.16b, v18.16b, v14.16b\n"
- "zip2 v14.16b, v18.16b, v14.16b\n"
- "ldr q31, [%x[params], #0x0]\n"
- "zip2 v22.16b, v13.16b, v21.16b\n"
- "zip1 v13.16b, v13.16b, v21.16b\n"
- "zip1 v21.16b, v28.16b, v27.16b\n"
- "zip2 v27.16b, v28.16b, v27.16b\n"
- "zip2 v20.16b, v5.16b, v0.16b\n"
- "zip1 v5.16b, v5.16b, v0.16b\n"
- "zip1 v19.16b, v29.16b, v7.16b\n"
- "zip2 v7.16b, v29.16b, v7.16b\n"
- "zip2 v18.16b, v16.16b, v2.16b\n"
- "zip1 v16.16b, v16.16b, v2.16b\n"
- "zip1 v17.16b, v30.16b, v1.16b\n"
- "zip2 v1.16b, v30.16b, v1.16b\n"
- "zip2 v28.16b, v13.16b, v21.16b\n"
- "zip1 v13.16b, v13.16b, v21.16b\n"
- "zip1 v21.16b, v22.16b, v27.16b\n"
- "zip2 v27.16b, v22.16b, v27.16b\n"
- "zip2 v29.16b, v5.16b, v19.16b\n"
- "zip1 v5.16b, v5.16b, v19.16b\n"
- "zip1 v0.16b, v20.16b, v7.16b\n"
- "zip2 v7.16b, v20.16b, v7.16b\n"
- "zip2 v30.16b, v16.16b, v17.16b\n"
- "zip1 v16.16b, v16.16b, v17.16b\n"
- "zip1 v2.16b, v18.16b, v1.16b\n"
- "zip2 v1.16b, v18.16b, v1.16b\n"
- "mov v26.16b, v31.16b\n"
- "mov v18.16b, v31.16b\n"
- ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n"
- "mov v4.16b, v31.16b\n"
- ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n"
- ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n"
- "ext v11.16b, v11.16b, v11.16b, #0x1\n"
- "ext v13.16b, v13.16b, v13.16b, #0x1\n"
- ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n"
+ "zip2 v24.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v20.16b, v5.16b, v28.16b\n"
+ "zip2 v28.16b, v5.16b, v28.16b\n"
+ "zip2 v23.16b, v12.16b, v16.16b\n"
+ "zip1 v12.16b, v12.16b, v16.16b\n"
+ "zip1 v30.16b, v17.16b, v14.16b\n"
+ "zip2 v14.16b, v17.16b, v14.16b\n"
+ "ldr q3, [%x[params], #0x0]\n"
+ "zip2 v19.16b, v8.16b, v29.16b\n"
+ "zip1 v8.16b, v8.16b, v29.16b\n"
+ "zip1 v18.16b, v21.16b, v22.16b\n"
+ "zip2 v22.16b, v21.16b, v22.16b\n"
+ "zip2 v17.16b, v9.16b, v31.16b\n"
+ "zip1 v9.16b, v9.16b, v31.16b\n"
+ "zip1 v16.16b, v1.16b, v2.16b\n"
+ "zip2 v2.16b, v1.16b, v2.16b\n"
+ "zip2 v5.16b, v15.16b, v20.16b\n"
+ "zip1 v15.16b, v15.16b, v20.16b\n"
+ "zip1 v7.16b, v24.16b, v28.16b\n"
+ "zip2 v28.16b, v24.16b, v28.16b\n"
+ "zip2 v21.16b, v8.16b, v18.16b\n"
+ "zip1 v8.16b, v8.16b, v18.16b\n"
+ "zip1 v29.16b, v19.16b, v22.16b\n"
+ "zip2 v22.16b, v19.16b, v22.16b\n"
+ "zip2 v1.16b, v9.16b, v16.16b\n"
+ "zip1 v9.16b, v9.16b, v16.16b\n"
+ "zip1 v31.16b, v17.16b, v2.16b\n"
+ "zip2 v2.16b, v17.16b, v2.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "mov v18.16b, v3.16b\n"
+ ".inst 0x4e8c9543 // sdot v3.4s, v10.16b, v12.16b\n"
+ ".inst 0x4e8f9540 // sdot v0.4s, v10.16b, v15.16b\n"
+ "ext v12.16b, v12.16b, v12.16b, #0x1\n"
+ ".inst 0x4e8f9483 // sdot v3.4s, v4.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e8c955a // sdot v26.4s, v10.16b, v12.16b\n"
"ldr q17, [%x[params], #0x40]\n"
- ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n"
- ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n"
- ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e889480 // sdot v0.4s, v4.16b, v8.16b\n"
+ ".inst 0x4e8f9552 // sdot v18.4s, v10.16b, v15.16b\n"
+ ".inst 0x4e889723 // sdot v3.4s, v25.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8f949a // sdot v26.4s, v4.16b, v15.16b\n"
"ldr q20, [%x[params], #0x50]\n"
- ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n"
- ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n"
- "ext v16.16b, v16.16b, v16.16b, #0x1\n"
"add %x[params], %x[params], #0x60\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n"
- ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ ".inst 0x4e899720 // sdot v0.4s, v25.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e889492 // sdot v18.4s, v4.16b, v8.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ ".inst 0x4e88973a // sdot v26.4s, v25.16b, v8.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ ".inst 0x4e899732 // sdot v18.4s, v25.16b, v9.16b\n"
+ "and v16.16b, v3.16b, v20.16b\n"
"sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "and v19.16b, v0.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v19.16b, v26.16b, v20.16b\n"
- "and v17.16b, v18.16b, v20.16b\n"
- "and v16.16b, v4.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
+ "and v16.16b, v18.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v19.4s\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v19.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "sqadd v4.4s, v4.4s, v16.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v20.4s\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
"srshl v18.4s, v18.4s, v20.4s\n"
- "srshl v4.4s, v4.4s, v20.4s\n"
- "add v31.4s, v31.4s, v15.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v18.4s, v18.4s, v15.4s\n"
- "add v4.4s, v4.4s, v15.4s\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v18.4s, v18.4s, v8.4s\n"
- "smax v4.4s, v4.4s, v8.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smin v4.4s, v4.4s, v12.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smax v18.4s, v18.4s, v27.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 20f\n"
- "str s31, [x25, x11]\n"
- "str s26, [x24, x11]\n"
- "str s18, [x23, x11]\n"
- "str s4, [x22, x11]\n"
+ "str s3, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s0, [x9, x12]\n"
+ "str s18, [x28, x12]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
- "add x25, x25, x11\n"
- "add x24, x24, x11\n"
- "add x23, x23, x11\n"
- "add x22, x22, x11\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 21f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v26.h }[0], [x24], #0x2\n"
- "st1 { v18.h }[0], [x23], #0x2\n"
- "st1 { v4.h }[0], [x22], #0x2\n"
+ "st1 { v3.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v0.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v26.b }[2], [x24], #0x1\n"
- "st1 { v18.b }[2], [x23], #0x1\n"
- "st1 { v4.b }[2], [x22], #0x1\n"
+ "st1 { v3.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v0.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v26.b }[0], [x24], #0x1\n"
- "st1 { v18.b }[0], [x23], #0x1\n"
- "st1 { v4.b }[0], [x22], #0x1\n"
+ "st1 { v3.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v0.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
- "add x11, x11, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
- "ldr q31, [%x[params], #0x0]\n"
- "ldr q23, [%x[params], #0x10]\n"
- "mov v26.16b, v31.16b\n"
- "mov v18.16b, v31.16b\n"
- "ldr q22, [%x[params], #0x20]\n"
+ "ldr q3, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "cmp x20, #0x4\n"
+ "ldr q24, [%x[params], #0x20]\n"
"ldr q16, [%x[params], #0x30]\n"
- "mov v4.16b, v31.16b\n"
- ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n"
"ldr q17, [%x[params], #0x40]\n"
"ldr q20, [%x[params], #0x50]\n"
- ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n"
- ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "ext v28.16b, v28.16b, v28.16b, #0x1\n"
- ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n"
- "cmp x20, #0x4\n"
- ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n"
- ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n"
- "ext v29.16b, v29.16b, v29.16b, #0x1\n"
- ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n"
- ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n"
- ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n"
- "ext v30.16b, v30.16b, v30.16b, #0x1\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
- ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "mov v18.16b, v3.16b\n"
+ ".inst 0x4e979723 // sdot v3.4s, v25.16b, v23.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ ".inst 0x4e859720 // sdot v0.4s, v25.16b, v5.16b\n"
+ ".inst 0x4e859703 // sdot v3.4s, v24.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e97973a // sdot v26.4s, v25.16b, v23.16b\n"
+ ".inst 0x4e859732 // sdot v18.4s, v25.16b, v5.16b\n"
+ ".inst 0x4e959700 // sdot v0.4s, v24.16b, v21.16b\n"
+ ".inst 0x4e959603 // sdot v3.4s, v16.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e85971a // sdot v26.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e959712 // sdot v18.4s, v24.16b, v21.16b\n"
+ ".inst 0x4e819600 // sdot v0.4s, v16.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ ".inst 0x4e95961a // sdot v26.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
+ "and v16.16b, v3.16b, v20.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v19.16b, v26.16b, v20.16b\n"
- "and v17.16b, v18.16b, v20.16b\n"
- "and v16.16b, v4.16b, v20.16b\n"
+ "and v19.16b, v0.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v18.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v19.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "sqadd v4.4s, v4.4s, v16.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v0.4s, v0.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
"srshl v18.4s, v18.4s, v20.4s\n"
- "srshl v4.4s, v4.4s, v20.4s\n"
- "add v31.4s, v31.4s, v15.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v18.4s, v18.4s, v15.4s\n"
- "add v4.4s, v4.4s, v15.4s\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v18.4s, v18.4s, v8.4s\n"
- "smax v4.4s, v4.4s, v8.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smin v4.4s, v4.4s, v12.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v13.4s\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smax v18.4s, v18.4s, v27.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 24f\n"
- "str s31, [x25, x11]\n"
- "str s26, [x24, x11]\n"
- "str s18, [x23, x11]\n"
- "str s4, [x22, x11]\n"
+ "str s3, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s0, [x9, x12]\n"
+ "str s18, [x28, x12]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
- "add x25, x25, x11\n"
- "add x24, x24, x11\n"
- "add x23, x23, x11\n"
- "add x22, x22, x11\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 25f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v26.h }[0], [x24], #0x2\n"
- "st1 { v18.h }[0], [x23], #0x2\n"
- "st1 { v4.h }[0], [x22], #0x2\n"
+ "st1 { v3.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v0.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v26.b }[2], [x24], #0x1\n"
- "st1 { v18.b }[2], [x23], #0x1\n"
- "st1 { v4.b }[2], [x22], #0x1\n"
+ "st1 { v3.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v0.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v26.b }[0], [x24], #0x1\n"
- "st1 { v18.b }[0], [x23], #0x1\n"
- "st1 { v4.b }[0], [x22], #0x1\n"
+ "st1 { v3.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v0.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
- "add x11, x11, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q3, [%x[params], #0x0]\n"
"ldr q23, [%x[params], #0x10]\n"
- "mov v26.16b, v31.16b\n"
- "mov v18.16b, v31.16b\n"
- "ldr q22, [%x[params], #0x20]\n"
+ "cmp x20, #0x4\n"
+ "ldr q21, [%x[params], #0x20]\n"
"ldr q16, [%x[params], #0x30]\n"
- "mov v4.16b, v31.16b\n"
- ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
"ldr q17, [%x[params], #0x40]\n"
"ldr q20, [%x[params], #0x50]\n"
- ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n"
- ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n"
- "cmp x20, #0x4\n"
- ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n"
- ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
- ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n"
- ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n"
- ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n"
- "and v16.16b, v31.16b, v20.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "mov v18.16b, v3.16b\n"
+ ".inst 0x4e9e96e3 // sdot v3.4s, v23.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e8796e0 // sdot v0.4s, v23.16b, v7.16b\n"
+ ".inst 0x4e8796a3 // sdot v3.4s, v21.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9e96fa // sdot v26.4s, v23.16b, v30.16b\n"
+ ".inst 0x4e8796f2 // sdot v18.4s, v23.16b, v7.16b\n"
+ ".inst 0x4e9d96a0 // sdot v0.4s, v21.16b, v29.16b\n"
+ ".inst 0x4e9d9603 // sdot v3.4s, v16.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e8796ba // sdot v26.4s, v21.16b, v7.16b\n"
+ ".inst 0x4e9d96b2 // sdot v18.4s, v21.16b, v29.16b\n"
+ ".inst 0x4e9f9600 // sdot v0.4s, v16.16b, v31.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x1\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9f9612 // sdot v18.4s, v16.16b, v31.16b\n"
+ "and v16.16b, v3.16b, v20.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v19.16b, v26.16b, v20.16b\n"
- "and v17.16b, v18.16b, v20.16b\n"
- "and v16.16b, v4.16b, v20.16b\n"
+ "and v19.16b, v0.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v18.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v19.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "sqadd v4.4s, v4.4s, v16.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v0.4s, v0.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
"srshl v18.4s, v18.4s, v20.4s\n"
- "srshl v4.4s, v4.4s, v20.4s\n"
- "add v31.4s, v31.4s, v15.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v18.4s, v18.4s, v15.4s\n"
- "add v4.4s, v4.4s, v15.4s\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v18.4s, v18.4s, v8.4s\n"
- "smax v4.4s, v4.4s, v8.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smin v4.4s, v4.4s, v12.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v13.4s\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smax v18.4s, v18.4s, v27.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
"blt 28f\n"
- "str s31, [x25, x11]\n"
- "str s26, [x24, x11]\n"
- "str s18, [x23, x11]\n"
- "str s4, [x22, x11]\n"
+ "str s3, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s0, [x9, x12]\n"
+ "str s18, [x28, x12]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
- "add x25, x25, x11\n"
- "add x24, x24, x11\n"
- "add x23, x23, x11\n"
- "add x22, x22, x11\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 29f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v26.h }[0], [x24], #0x2\n"
- "st1 { v18.h }[0], [x23], #0x2\n"
- "st1 { v4.h }[0], [x22], #0x2\n"
+ "st1 { v3.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v0.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v26.b }[2], [x24], #0x1\n"
- "st1 { v18.b }[2], [x23], #0x1\n"
- "st1 { v4.b }[2], [x22], #0x1\n"
+ "st1 { v3.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v0.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v26.b }[0], [x24], #0x1\n"
- "st1 { v18.b }[0], [x23], #0x1\n"
- "st1 { v4.b }[0], [x22], #0x1\n"
+ "st1 { v3.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v0.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
- "add x11, x11, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
- "ldr q31, [%x[params], #0x0]\n"
+ "ldr q3, [%x[params], #0x0]\n"
"ldr q20, [%x[params], #0x10]\n"
- "mov v26.16b, v31.16b\n"
- "mov v18.16b, v31.16b\n"
- "ldr q19, [%x[params], #0x20]\n"
+ "ldr q17, [%x[params], #0x20]\n"
"ldr q16, [%x[params], #0x30]\n"
- "mov v4.16b, v31.16b\n"
- ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n"
- "ldr q17, [%x[params], #0x40]\n"
- "ldr q22, [%x[params], #0x50]\n"
- ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n"
- ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
+ "ldr q1, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mov v26.16b, v3.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "mov v18.16b, v3.16b\n"
+ ".inst 0x4e8e9683 // sdot v3.4s, v20.16b, v14.16b\n"
"ext v14.16b, v14.16b, v14.16b, #0x1\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9c9680 // sdot v0.4s, v20.16b, v28.16b\n"
+ ".inst 0x4e9c9623 // sdot v3.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n"
- "add %x[params], %x[params], #0x60\n"
- ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n"
- ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n"
- ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n"
- ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n"
- ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
- "ext v1.16b, v1.16b, v1.16b, #0x1\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n"
- ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n"
- "and v16.16b, v31.16b, v22.16b\n"
+ ".inst 0x4e9c9692 // sdot v18.4s, v20.16b, v28.16b\n"
+ ".inst 0x4e969620 // sdot v0.4s, v17.16b, v22.16b\n"
+ ".inst 0x4e969603 // sdot v3.4s, v16.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e9c963a // sdot v26.4s, v17.16b, v28.16b\n"
+ ".inst 0x4e969632 // sdot v18.4s, v17.16b, v22.16b\n"
+ ".inst 0x4e829600 // sdot v0.4s, v16.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v3.4s, v3.4s, v1.4s\n"
+ ".inst 0x4e96961a // sdot v26.4s, v16.16b, v22.16b\n"
+ ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
+ "and v16.16b, v3.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v1.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v23.16b, v26.16b, v22.16b\n"
- "and v17.16b, v18.16b, v22.16b\n"
- "and v16.16b, v4.16b, v22.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v1.4s\n"
+ "and v29.16b, v0.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v19.16b\n"
+ "and v16.16b, v18.16b, v19.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v23.4s\n"
- "sqadd v18.4s, v18.4s, v17.4s\n"
- "sqadd v4.4s, v4.4s, v16.4s\n"
- "srshl v31.4s, v31.4s, v22.4s\n"
- "srshl v26.4s, v26.4s, v22.4s\n"
- "srshl v18.4s, v18.4s, v22.4s\n"
- "srshl v4.4s, v4.4s, v22.4s\n"
- "add v31.4s, v31.4s, v15.4s\n"
- "add v26.4s, v26.4s, v15.4s\n"
- "add v18.4s, v18.4s, v15.4s\n"
- "add v4.4s, v4.4s, v15.4s\n"
- "smax v31.4s, v31.4s, v8.4s\n"
- "smax v26.4s, v26.4s, v8.4s\n"
- "smax v18.4s, v18.4s, v8.4s\n"
- "smax v4.4s, v4.4s, v8.4s\n"
- "smin v31.4s, v31.4s, v12.4s\n"
- "smin v26.4s, v26.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smin v4.4s, v4.4s, v12.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "sqadd v0.4s, v0.4s, v29.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v13.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v19.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v27.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
+ "add v0.4s, v0.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v13.4s\n"
+ "smax v0.4s, v0.4s, v27.4s\n"
+ "smax v26.4s, v26.4s, v27.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smax v18.4s, v18.4s, v27.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
- "add x25, x25, x11\n"
- "add x24, x24, x11\n"
- "add x23, x23, x11\n"
- "add x22, x22, x11\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 33f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v26.h }[0], [x24], #0x2\n"
- "st1 { v18.h }[0], [x23], #0x2\n"
- "st1 { v4.h }[0], [x22], #0x2\n"
+ "st1 { v3.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v0.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v26.b }[2], [x24], #0x1\n"
- "st1 { v18.b }[2], [x23], #0x1\n"
- "st1 { v4.b }[2], [x22], #0x1\n"
+ "st1 { v3.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v0.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v26.b }[0], [x24], #0x1\n"
- "st1 { v18.b }[0], [x23], #0x1\n"
- "st1 { v4.b }[0], [x22], #0x1\n"
+ "st1 { v3.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v0.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 5a28daffbf..ceb2693550 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,1622 +33,1622 @@ namespace depthwise {
void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x20, #0x1\n"
- "orr x20, x20, #0x100\n"
+ "mov x17, #0x1\n"
+ "lsr x16, %x[n_channels], #0x4\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "orr x20, x20, #0x10000\n"
- "lsr x11, %x[n_channels], #0x4\n"
- "dup v12.4s, w20\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
+ "ldp x27, x26, [%x[inptrs], #0x10]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_minval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ldp x25, x24, [%x[inptrs], #0x20]\n"
+ "ldp x23, x22, [%x[inptrs], #0x30]\n"
+ "ld1r { v7.4s }, [x21]\n"
"ld1r { v11.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x0\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "ldp x25, x24, [%x[outptrs], #0x0]\n"
- "ldp x23, x22, [%x[outptrs], #0x10]\n"
- "cbz x11, 3f\n"
- "ldr q15, [x15, x28]\n"
- "ldr q28, [x14, x28]\n"
- "subs x11, x11, #0x1\n"
- "ldr q30, [x13, x28]\n"
- "ldr q8, [x12, x28]\n"
- "zip2 v19.16b, v15.16b, v30.16b\n"
- "zip1 v15.16b, v15.16b, v30.16b\n"
- "ldr q26, [x10, x28]\n"
- "ldr q0, [x9, x28]\n"
- "zip1 v7.16b, v28.16b, v8.16b\n"
- "zip2 v8.16b, v28.16b, v8.16b\n"
- "ldr q29, [x26, x28]\n"
- "ldr q10, [x21, x28]\n"
- "zip2 v25.16b, v15.16b, v7.16b\n"
- "zip1 v15.16b, v15.16b, v7.16b\n"
- "ldr q1, [%x[params], #0x10]\n"
- "ldr q6, [%x[params], #0x20]\n"
- "zip1 v7.16b, v19.16b, v8.16b\n"
- "zip2 v8.16b, v19.16b, v8.16b\n"
- "ldr q31, [%x[params], #0x0]\n"
- "ldr q20, [%x[params], #0x30]\n"
- "zip2 v21.16b, v26.16b, v29.16b\n"
- "zip1 v26.16b, v26.16b, v29.16b\n"
- "ldp x21, x20, [%x[inptrs], #0x40]\n"
- "ldr q22, [x21, x28]\n"
- "zip1 v27.16b, v0.16b, v10.16b\n"
- "zip2 v10.16b, v0.16b, v10.16b\n"
- "ldr q17, [x20, x28]\n"
- "ldp x21, x20, [%x[inptrs], #0x50]\n"
- "zip2 v23.16b, v26.16b, v27.16b\n"
- "zip1 v26.16b, v26.16b, v27.16b\n"
- "ldr q9, [x21, x28]\n"
- "ldr q5, [x20, x28]\n"
- "zip2 v28.16b, v22.16b, v9.16b\n"
- "zip1 v22.16b, v22.16b, v9.16b\n"
- "ldp x21, x20, [%x[inptrs], #0x60]\n"
- "ldr q27, [x21, x28]\n"
- "zip1 v24.16b, v17.16b, v5.16b\n"
- "zip2 v5.16b, v17.16b, v5.16b\n"
- "ldr q18, [x20, x28]\n"
+ "ld1r { v24.4s }, [x21]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "orr x17, x17, #0x100\n"
+ "mov x13, #0x0\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "orr x17, x17, #0x10000\n"
+ "dup v15.4s, w17\n"
+ "cbz x16, 3f\n"
+ "ldr q13, [x15, x13]\n"
+ "ldr q5, [x14, x13]\n"
+ "subs x16, x16, #0x1\n"
+ "ldr q27, [x27, x13]\n"
+ "ldr q9, [x26, x13]\n"
+ "ldr q1, [x25, x13]\n"
+ "ldr q28, [x24, x13]\n"
+ "ldr q26, [x23, x13]\n"
+ "ldr q4, [x22, x13]\n"
+ "ldr q30, [%x[params], #0x10]\n"
+ "ldr q8, [%x[params], #0x20]\n"
+ "zip2 v19.16b, v13.16b, v27.16b\n"
+ "zip1 v13.16b, v13.16b, v27.16b\n"
+ "ldr q17, [%x[params], #0x30]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "zip1 v3.16b, v5.16b, v9.16b\n"
+ "zip2 v9.16b, v5.16b, v9.16b\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "zip2 v18.16b, v1.16b, v26.16b\n"
+ "zip1 v1.16b, v1.16b, v26.16b\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "zip1 v3.16b, v21.16b, v10.16b\n"
- "zip2 v10.16b, v21.16b, v10.16b\n"
- "ldr q4, [x21, x28]\n"
- "ldr q9, [x20, x28]\n"
- "zip2 v17.16b, v27.16b, v4.16b\n"
- "zip1 v27.16b, v27.16b, v4.16b\n"
- "zip1 v4.16b, v18.16b, v9.16b\n"
- "zip2 v9.16b, v18.16b, v9.16b\n"
+ "zip1 v16.16b, v28.16b, v4.16b\n"
+ "zip2 v4.16b, v28.16b, v4.16b\n"
+ "ldr q10, [x27, x13]\n"
+ "ldr q14, [x26, x13]\n"
+ "zip2 v2.16b, v13.16b, v3.16b\n"
+ "zip1 v13.16b, v13.16b, v3.16b\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "zip2 v19.16b, v22.16b, v24.16b\n"
- "zip1 v22.16b, v22.16b, v24.16b\n"
- "zip1 v0.16b, v28.16b, v5.16b\n"
- "zip2 v5.16b, v28.16b, v5.16b\n"
+ "ldr q3, [x25, x13]\n"
+ "ldr q6, [x24, x13]\n"
+ "zip1 v0.16b, v19.16b, v9.16b\n"
+ "zip2 v9.16b, v19.16b, v9.16b\n"
+ "ldr q5, [x23, x13]\n"
+ "ldr q20, [x22, x13]\n"
+ "zip2 v21.16b, v1.16b, v16.16b\n"
+ "zip1 v1.16b, v1.16b, v16.16b\n"
+ "ldr q16, [x21, x13]\n"
+ "ldr q25, [x20, x13]\n"
+ "zip1 v28.16b, v18.16b, v4.16b\n"
+ "zip2 v4.16b, v18.16b, v4.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v19.16b, v10.16b, v3.16b\n"
+ "zip1 v10.16b, v10.16b, v3.16b\n"
+ "ldp x27, x26, [%x[inptrs], #0x10]\n"
+ "zip1 v18.16b, v14.16b, v6.16b\n"
+ "zip2 v6.16b, v14.16b, v6.16b\n"
+ "ldp x25, x24, [%x[inptrs], #0x20]\n"
+ "ldp x23, x22, [%x[inptrs], #0x30]\n"
+ "zip2 v23.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
"add %x[params], %x[params], #0x40\n"
- "zip2 v24.16b, v27.16b, v4.16b\n"
- "zip1 v27.16b, v27.16b, v4.16b\n"
- "zip1 v2.16b, v17.16b, v9.16b\n"
- "zip2 v9.16b, v17.16b, v9.16b\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "mov v28.16b, v31.16b\n"
+ "zip1 v16.16b, v20.16b, v25.16b\n"
+ "zip2 v25.16b, v20.16b, v25.16b\n"
+ "zip2 v29.16b, v10.16b, v18.16b\n"
+ "zip1 v10.16b, v10.16b, v18.16b\n"
+ "zip1 v27.16b, v19.16b, v6.16b\n"
+ "zip2 v6.16b, v19.16b, v6.16b\n"
+ "zip2 v18.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "zip1 v14.16b, v23.16b, v25.16b\n"
+ "zip2 v25.16b, v23.16b, v25.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
"beq 2f\n"
"1:" // Loop
- "movi v21.4s, #0x0\n"
- ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
- ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
- "add x28, x28, #0x10\n"
- ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
- ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
- "movi v18.4s, #0x0\n"
- "subs x11, x11, #0x1\n"
- ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
- ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
- ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
- ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
- ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
- ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
- "mls v31.4s, v21.4s, v16.4s\n"
- ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
- ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
- "ldr q26, [%x[params], #0x10]\n"
- ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
- ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
- "ldr q17, [%x[params], #0x0]\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
- ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
- "mls v30.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v21.4s, v16.4s\n"
- "and v15.16b, v31.16b, v26.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v17.4s\n"
- "sqrdmulh v29.4s, v29.4s, v17.4s\n"
- "sqrdmulh v28.4s, v28.4s, v17.4s\n"
- "ldr q1, [%x[params], #0x60]\n"
- "sqadd v31.4s, v31.4s, v15.4s\n"
- "and v18.16b, v30.16b, v26.16b\n"
- "and v21.16b, v29.16b, v26.16b\n"
- "and v17.16b, v28.16b, v26.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8d97df // udot v31.4s, v30.16b, v13.16b\n"
+ ".inst 0x6e8197c3 // udot v3.4s, v30.16b, v1.16b\n"
+ "add x13, x13, #0x10\n"
+ "movi v22.4s, #0x0\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0x6e8195f3 // udot v19.4s, v15.16b, v1.16b\n"
+ ".inst 0x6e81951f // udot v31.4s, v8.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x6e8a9503 // udot v3.4s, v8.16b, v10.16b\n"
+ ".inst 0x6e8a95f3 // udot v19.4s, v15.16b, v10.16b\n"
+ ".inst 0x6e8195f6 // udot v22.4s, v15.16b, v1.16b\n"
+ ".inst 0x6e8a963f // udot v31.4s, v17.16b, v10.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e8197d7 // udot v23.4s, v30.16b, v1.16b\n"
+ "mov v16.16b, v19.16b\n .inst 0x6e8595f0 // udot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x6e8d95f3 // udot v19.4s, v15.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x6e859623 // udot v3.4s, v17.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a95f6 // udot v22.4s, v15.16b, v10.16b\n"
+ ".inst 0x6e8d97da // udot v26.4s, v30.16b, v13.16b\n"
+ ".inst 0x6e8a9517 // udot v23.4s, v8.16b, v10.16b\n"
+ "mls v31.4s, v19.4s, v24.4s\n"
+ "movi v19.4s, #0x0\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x6e81951a // udot v26.4s, v8.16b, v1.16b\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "mov v16.16b, v22.16b\n .inst 0x6e8595f0 // udot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x6e8d95f6 // udot v22.4s, v15.16b, v13.16b\n"
+ "ldr q1, [%x[params], #0x0]\n"
+ ".inst 0x6e9595f3 // udot v19.4s, v15.16b, v21.16b\n"
+ ".inst 0x6e859637 // udot v23.4s, v17.16b, v5.16b\n"
+ ".inst 0x6e8a963a // udot v26.4s, v17.16b, v10.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v1.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v1.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v8.16b\n"
+ ".inst 0x6e9d95f3 // udot v19.4s, v15.16b, v29.16b\n"
+ "mls v26.4s, v22.4s, v24.4s\n"
+ "movi v20.4s, #0x0\n"
+ "sqrdmulh v23.4s, v23.4s, v1.4s\n"
+ "and v30.16b, v3.16b, v8.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "ldr q10, [%x[params], #0x60]\n"
+ "mov v22.16b, v19.16b\n .inst 0x6e9295f6 // udot v22.4s, v15.16b, v18.16b\n"
+ ".inst 0x6e8295f3 // udot v19.4s, v15.16b, v2.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v8.16b\n"
+ "and v16.16b, v26.16b, v8.16b\n"
+ "sqadd v3.4s, v3.4s, v30.4s\n"
+ "ldr q5, [%x[params], #0x50]\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v26.4s\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "ldr q18, [%x[params], #0x40]\n"
- "sqadd v29.4s, v29.4s, v21.4s\n"
- "ldr q27, [%x[params], #0x50]\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "ldr q15, [%x[params], #0x30]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v26.4s\n"
- "srshl v29.4s, v29.4s, v26.4s\n"
- "srshl v28.4s, v28.4s, v26.4s\n"
- "ldr q20, [%x[params], #0x70]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v8.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "ldr q30, [%x[params], #0x40]\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "ldr q1, [%x[params], #0x70]\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x6e979596 // udot v22.4s, v12.16b, v23.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s31, [x25, x27]\n"
- "ldr q26, [%x[params], #0x20]\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- ".inst 0x6e939596 // udot v22.4s, v12.16b, v19.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s30, [x24, x27]\n"
- "mov v6.16b, v22.16b\n .inst 0x6e989586 // udot v6.4s, v12.16b, v24.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s29, [x23, x27]\n"
- "mov v30.16b, v26.16b\n"
- ".inst 0x6e999596 // udot v22.4s, v12.16b, v25.16b\n"
- "str s28, [x22, x27]\n"
- "mov v29.16b, v26.16b\n"
- "mov v21.16b, v26.16b\n"
- ".inst 0x6e9995fa // udot v26.4s, v15.16b, v25.16b\n"
- ".inst 0x6e9795fd // udot v29.4s, v15.16b, v23.16b\n"
- ".inst 0x6e97965a // udot v26.4s, v18.16b, v23.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- "movi v28.4s, #0x0\n"
- ".inst 0x6e9995fe // udot v30.4s, v15.16b, v25.16b\n"
- ".inst 0x6e9795f5 // udot v21.4s, v15.16b, v23.16b\n"
- ".inst 0x6e97959c // udot v28.4s, v12.16b, v23.16b\n"
- ".inst 0x6e93965d // udot v29.4s, v18.16b, v19.16b\n"
- ".inst 0x6e93977a // udot v26.4s, v27.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x6e97965e // udot v30.4s, v18.16b, v23.16b\n"
- "ldr q4, [x9, x28]\n"
- ".inst 0x6e939655 // udot v21.4s, v18.16b, v19.16b\n"
- "mls v26.4s, v22.4s, v16.4s\n"
- ".inst 0x6e93959c // udot v28.4s, v12.16b, v19.16b\n"
- ".inst 0x6e98977d // udot v29.4s, v27.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e93977e // udot v30.4s, v27.16b, v19.16b\n"
- ".inst 0x6e989775 // udot v21.4s, v27.16b, v24.16b\n"
- "sqrdmulh v26.4s, v26.4s, v1.4s\n"
- "mov v17.16b, v28.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
- ".inst 0x6e99959c // udot v28.4s, v12.16b, v25.16b\n"
- "ldr q31, [x14, x28]\n"
- "mls v30.4s, v28.4s, v16.4s\n"
- "mls v29.4s, v6.4s, v16.4s\n"
- "mls v21.4s, v17.4s, v16.4s\n"
- "and v17.16b, v26.16b, v20.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v1.4s\n"
- "sqrdmulh v29.4s, v29.4s, v1.4s\n"
- "sqrdmulh v21.4s, v21.4s, v1.4s\n"
- "ldr q27, [%x[params], #0xc0]\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "and v18.16b, v30.16b, v20.16b\n"
- "and v6.16b, v29.16b, v20.16b\n"
- "and v17.16b, v21.16b, v20.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "ldr q28, [%x[params], #0xa0]\n"
- "sqadd v29.4s, v29.4s, v6.4s\n"
- "ldr q24, [%x[params], #0xb0]\n"
- "sqadd v21.4s, v21.4s, v17.4s\n"
- "ldr q15, [%x[params], #0x90]\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v21.4s, v21.4s, v20.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
"smin v26.4s, v26.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s31, [x11, x12]\n"
+ "ldr q31, [%x[params], #0x20]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x6e839596 // udot v22.4s, v12.16b, v3.16b\n"
- ".inst 0x6e809596 // udot v22.4s, v12.16b, v0.16b\n"
+ "str s3, [x9, x12]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s26, [x25, x27]\n"
- "ldr q26, [%x[params], #0x80]\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "mov v18.16b, v22.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s30, [x24, x27]\n"
- ".inst 0x6e879596 // udot v22.4s, v12.16b, v7.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s29, [x23, x27]\n"
- "mov v6.16b, v26.16b\n"
- "str s21, [x22, x27]\n"
- "mov v25.16b, v26.16b\n"
- "mov v20.16b, v26.16b\n"
- ".inst 0x6e8795fa // udot v26.4s, v15.16b, v7.16b\n"
- ".inst 0x6e8395f9 // udot v25.4s, v15.16b, v3.16b\n"
- ".inst 0x6e83979a // udot v26.4s, v28.16b, v3.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v23.4s, #0x0\n"
- ".inst 0x6e8795e6 // udot v6.4s, v15.16b, v7.16b\n"
- ".inst 0x6e8395f4 // udot v20.4s, v15.16b, v3.16b\n"
- ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
- ".inst 0x6e809799 // udot v25.4s, v28.16b, v0.16b\n"
- ".inst 0x6e80971a // udot v26.4s, v24.16b, v0.16b\n"
- "ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x6e839786 // udot v6.4s, v28.16b, v3.16b\n"
- "ldr q19, [x26, x28]\n"
- ".inst 0x6e809794 // udot v20.4s, v28.16b, v0.16b\n"
- "mls v26.4s, v22.4s, v16.4s\n"
- ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
- ".inst 0x6e829719 // udot v25.4s, v24.16b, v2.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "mov v8.16b, v31.16b\n"
+ "str s26, [x10, x12]\n"
+ "mov v16.16b, v31.16b\n"
+ "str s23, [x28, x12]\n"
+ "mov v26.16b, v31.16b\n"
+ ".inst 0x6e82963f // udot v31.4s, v17.16b, v2.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x6e959628 // udot v8.4s, v17.16b, v21.16b\n"
"ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x6e809706 // udot v6.4s, v24.16b, v0.16b\n"
- ".inst 0x6e829714 // udot v20.4s, v24.16b, v2.16b\n"
- "sqrdmulh v26.4s, v26.4s, v27.4s\n"
- "mov v17.16b, v23.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
- ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
- "ldr q21, [x13, x28]\n"
- "mls v6.4s, v23.4s, v16.4s\n"
- "mls v25.4s, v18.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v26.16b, v1.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v6.4s, v6.4s, v27.4s\n"
- "sqrdmulh v25.4s, v25.4s, v27.4s\n"
- "sqrdmulh v20.4s, v20.4s, v27.4s\n"
- "ldr q15, [%x[params], #0x120]\n"
- "sqadd v26.4s, v26.4s, v17.4s\n"
- "and v18.16b, v6.16b, v1.16b\n"
- "and v22.16b, v25.16b, v1.16b\n"
- "and v17.16b, v20.16b, v1.16b\n"
+ ".inst 0x6e9597df // udot v31.4s, v30.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6e829630 // udot v16.4s, v17.16b, v2.16b\n"
+ ".inst 0x6e95963a // udot v26.4s, v17.16b, v21.16b\n"
+ ".inst 0x6e9595f4 // udot v20.4s, v15.16b, v21.16b\n"
+ ".inst 0x6e9d97c8 // udot v8.4s, v30.16b, v29.16b\n"
+ ".inst 0x6e9d94bf // udot v31.4s, v5.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x6e9597d0 // udot v16.4s, v30.16b, v21.16b\n"
+ "ldr q3, [x24, x13]\n"
+ ".inst 0x6e9d97da // udot v26.4s, v30.16b, v29.16b\n"
+ ".inst 0x6e9d95f4 // udot v20.4s, v15.16b, v29.16b\n"
+ ".inst 0x6e9294a8 // udot v8.4s, v5.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "mls v31.4s, v19.4s, v24.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e9d94b0 // udot v16.4s, v5.16b, v29.16b\n"
+ ".inst 0x6e9294ba // udot v26.4s, v5.16b, v18.16b\n"
+ "mov v17.16b, v20.16b\n .inst 0x6e9295f1 // udot v17.4s, v15.16b, v18.16b\n"
+ ".inst 0x6e8295f4 // udot v20.4s, v15.16b, v2.16b\n"
+ "ldr q2, [x14, x13]\n"
+ ".inst 0x6e9c95f7 // udot v23.4s, v15.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v10.4s\n"
+ "mls v8.4s, v22.4s, v24.4s\n"
+ "mls v26.4s, v17.4s, v24.4s\n"
+ "and v18.16b, v31.16b, v1.16b\n"
+ "mls v16.4s, v20.4s, v24.4s\n"
+ "movi v21.4s, #0x0\n"
+ "sqrdmulh v8.4s, v8.4s, v10.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v10.4s\n"
+ ".inst 0x6e9b95f7 // udot v23.4s, v15.16b, v27.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqrdmulh v16.4s, v16.4s, v10.4s\n"
+ "ldr q13, [%x[params], #0xc0]\n"
+ "and v17.16b, v8.16b, v1.16b\n"
+ "sqadd v31.4s, v31.4s, v18.4s\n"
+ "and v20.16b, v26.16b, v1.16b\n"
+ "and v10.16b, v16.16b, v1.16b\n"
+ "mov v19.16b, v23.16b\n .inst 0x6e8e95f3 // udot v19.4s, v15.16b, v14.16b\n"
+ ".inst 0x6e8095f7 // udot v23.4s, v15.16b, v0.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "ldr q30, [%x[params], #0xb0]\n"
+ "sqadd v16.4s, v16.4s, v10.4s\n"
+ "ldr q17, [%x[params], #0xa0]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "srshl v8.4s, v8.4s, v1.4s\n"
+ "srshl v16.4s, v16.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
"srshl v26.4s, v26.4s, v1.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "ldr q30, [%x[params], #0x100]\n"
- "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "add v8.4s, v8.4s, v12.4s\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v8.4s, v8.4s, v7.4s\n"
+ "smax v16.4s, v16.4s, v7.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v8.4s, v8.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s31, [x11, x12]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s16, [x10, x12]\n"
+ "mov v18.16b, v10.16b\n"
+ "str s8, [x9, x12]\n"
+ "mov v8.16b, v10.16b\n"
+ "str s26, [x28, x12]\n"
+ "mov v26.16b, v10.16b\n"
+ ".inst 0x6e80968a // udot v10.4s, v20.16b, v0.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x6e9c9688 // udot v8.4s, v20.16b, v28.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e9c962a // udot v10.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x6e809692 // udot v18.4s, v20.16b, v0.16b\n"
+ ".inst 0x6e9c969a // udot v26.4s, v20.16b, v28.16b\n"
+ ".inst 0x6e9c95f5 // udot v21.4s, v15.16b, v28.16b\n"
+ ".inst 0x6e9b9628 // udot v8.4s, v17.16b, v27.16b\n"
+ ".inst 0x6e9b97ca // udot v10.4s, v30.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9c9632 // udot v18.4s, v17.16b, v28.16b\n"
+ "ldr q28, [x23, x13]\n"
+ ".inst 0x6e9b963a // udot v26.4s, v17.16b, v27.16b\n"
+ ".inst 0x6e9b95f5 // udot v21.4s, v15.16b, v27.16b\n"
+ ".inst 0x6e8e97c8 // udot v8.4s, v30.16b, v14.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "mls v10.4s, v23.4s, v24.4s\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e9b97d2 // udot v18.4s, v30.16b, v27.16b\n"
+ ".inst 0x6e8e97da // udot v26.4s, v30.16b, v14.16b\n"
+ "mov v16.16b, v21.16b\n .inst 0x6e8e95f0 // udot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x6e8095f5 // udot v21.4s, v15.16b, v0.16b\n"
+ "ldr q29, [x27, x13]\n"
+ ".inst 0x6e8495e1 // udot v1.4s, v15.16b, v4.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v13.4s\n"
+ "mls v8.4s, v19.4s, v24.4s\n"
+ "mls v26.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v10.16b, v22.16b\n"
+ "mls v18.4s, v21.4s, v24.4s\n"
+ "movi v5.4s, #0x0\n"
+ "sqrdmulh v8.4s, v8.4s, v13.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v13.4s\n"
+ ".inst 0x6e8695e1 // udot v1.4s, v15.16b, v6.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v18.4s, v18.4s, v13.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "and v17.16b, v8.16b, v22.16b\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v22.16b\n"
+ "and v16.16b, v18.16b, v22.16b\n"
+ "mov v19.16b, v1.16b\n .inst 0x6e9995f3 // udot v19.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e8995e1 // udot v1.4s, v15.16b, v9.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
"ldr q27, [%x[params], #0x110]\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q24, [%x[params], #0xf0]\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "srshl v6.4s, v6.4s, v1.4s\n"
- "srshl v25.4s, v25.4s, v1.4s\n"
- "srshl v20.4s, v20.4s, v1.4s\n"
- "ldr q23, [%x[params], #0x130]\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "add v6.4s, v6.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
+ "sqadd v18.4s, v18.4s, v16.4s\n"
+ "ldr q17, [%x[params], #0x100]\n"
+ "add v10.4s, v10.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "srshl v8.4s, v8.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "smax v10.4s, v10.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "ldr q31, [%x[params], #0x130]\n"
+ "add v8.4s, v8.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "smin v10.4s, v10.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v8.4s, v8.4s, v7.4s\n"
+ "smax v18.4s, v18.4s, v7.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v8.4s, v8.4s, v11.4s\n"
+ "smin v18.4s, v18.4s, v11.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
"smin v26.4s, v26.4s, v11.4s\n"
- "smax v6.4s, v6.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v6.4s, v6.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s10, [x11, x12]\n"
+ "ldr q0, [%x[params], #0xe0]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "movi v0.4s, #0x0\n"
- ".inst 0x6e8a9580 // udot v0.4s, v12.16b, v10.16b\n"
- ".inst 0x6e859580 // udot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str s26, [x25, x27]\n"
- "ldr q28, [%x[params], #0xe0]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v22.16b, v0.16b\n .inst 0x6e899596 // udot v22.4s, v12.16b, v9.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s6, [x24, x27]\n"
- ".inst 0x6e889580 // udot v0.4s, v12.16b, v8.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s25, [x23, x27]\n"
- "mov v29.16b, v28.16b\n"
- "str s20, [x22, x27]\n"
- "mov v25.16b, v28.16b\n"
- "mov v7.16b, v28.16b\n"
- ".inst 0x6e88971c // udot v28.4s, v24.16b, v8.16b\n"
- ".inst 0x6e8a9719 // udot v25.4s, v24.16b, v10.16b\n"
- ".inst 0x6e8a97dc // udot v28.4s, v30.16b, v10.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e88971d // udot v29.4s, v24.16b, v8.16b\n"
- ".inst 0x6e8a9707 // udot v7.4s, v24.16b, v10.16b\n"
- ".inst 0x6e8a9591 // udot v17.4s, v12.16b, v10.16b\n"
- ".inst 0x6e8597d9 // udot v25.4s, v30.16b, v5.16b\n"
- ".inst 0x6e85977c // udot v28.4s, v27.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x6e8a97dd // udot v29.4s, v30.16b, v10.16b\n"
- "ldr q10, [x21, x28]\n"
- ".inst 0x6e8597c7 // udot v7.4s, v30.16b, v5.16b\n"
- "mls v28.4s, v0.4s, v16.4s\n"
- ".inst 0x6e859591 // udot v17.4s, v12.16b, v5.16b\n"
- ".inst 0x6e899779 // udot v25.4s, v27.16b, v9.16b\n"
+ "str s18, [x10, x12]\n"
+ "mov v22.16b, v0.16b\n"
+ "str s8, [x9, x12]\n"
+ "mov v23.16b, v0.16b\n"
+ "str s26, [x28, x12]\n"
+ "mov v14.16b, v0.16b\n"
+ ".inst 0x6e899600 // udot v0.4s, v16.16b, v9.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x6e849617 // udot v23.4s, v16.16b, v4.16b\n"
"ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e85977d // udot v29.4s, v27.16b, v5.16b\n"
- ".inst 0x6e899767 // udot v7.4s, v27.16b, v9.16b\n"
- "sqrdmulh v28.4s, v28.4s, v15.4s\n"
- "mov v18.16b, v17.16b\n .inst 0x6e899592 // udot v18.4s, v12.16b, v9.16b\n"
- ".inst 0x6e889591 // udot v17.4s, v12.16b, v8.16b\n"
- "ldr q8, [x12, x28]\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mls v25.4s, v22.4s, v16.4s\n"
- "mls v7.4s, v18.4s, v16.4s\n"
- "and v17.16b, v28.16b, v23.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v29.4s, v29.4s, v15.4s\n"
- "sqrdmulh v25.4s, v25.4s, v15.4s\n"
- "sqrdmulh v7.4s, v7.4s, v15.4s\n"
- "ldr q15, [x15, x28]\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "ldp x21, x20, [%x[inptrs], #0x40]\n"
- "ldr q22, [x21, x28]\n"
- "ldr q3, [x20, x28]\n"
- "and v24.16b, v29.16b, v23.16b\n"
- "and v20.16b, v25.16b, v23.16b\n"
- "and v17.16b, v7.16b, v23.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
+ ".inst 0x6e849620 // udot v0.4s, v17.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ ".inst 0x6e899616 // udot v22.4s, v16.16b, v9.16b\n"
+ ".inst 0x6e84960e // udot v14.4s, v16.16b, v4.16b\n"
+ ".inst 0x6e8495e5 // udot v5.4s, v15.16b, v4.16b\n"
+ ".inst 0x6e869637 // udot v23.4s, v17.16b, v6.16b\n"
+ ".inst 0x6e869760 // udot v0.4s, v27.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x6e849636 // udot v22.4s, v17.16b, v4.16b\n"
+ "ldr q4, [x22, x13]\n"
+ ".inst 0x6e86962e // udot v14.4s, v17.16b, v6.16b\n"
+ ".inst 0x6e8695e5 // udot v5.4s, v15.16b, v6.16b\n"
+ ".inst 0x6e999777 // udot v23.4s, v27.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "mls v0.4s, v1.4s, v24.4s\n"
+ ".inst 0x6e869776 // udot v22.4s, v27.16b, v6.16b\n"
+ ".inst 0x6e99976e // udot v14.4s, v27.16b, v25.16b\n"
+ "mov v17.16b, v5.16b\n .inst 0x6e9995f1 // udot v17.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e8995e5 // udot v5.4s, v15.16b, v9.16b\n"
+ "ldr q9, [x26, x13]\n"
+ "sqrdmulh v0.4s, v0.4s, v30.4s\n"
+ "mls v23.4s, v19.4s, v24.4s\n"
+ "and v16.16b, v0.16b, v31.16b\n"
+ "mls v22.4s, v5.4s, v24.4s\n"
+ "mls v14.4s, v17.4s, v24.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v30.4s\n"
+ "ldr q13, [x15, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
"ldp x21, x20, [%x[inptrs], #0x50]\n"
- "ldr q2, [x21, x28]\n"
- "ldr q5, [x20, x28]\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v28.4s, v28.4s, v23.4s\n"
- "sqadd v29.4s, v29.4s, v24.4s\n"
- "ldr q6, [%x[params], #0x160]\n"
- "sqadd v25.4s, v25.4s, v20.4s\n"
- "ldr q20, [%x[params], #0x170]\n"
- "sqadd v7.4s, v7.4s, v17.4s\n"
- "ldr q1, [%x[params], #0x150]\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "srshl v29.4s, v29.4s, v23.4s\n"
- "srshl v25.4s, v25.4s, v23.4s\n"
- "srshl v7.4s, v7.4s, v23.4s\n"
- "ldr q26, [x10, x28]\n"
- "ldp x21, x20, [%x[inptrs], #0x60]\n"
- "ldr q27, [x21, x28]\n"
- "ldr q30, [x20, x28]\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v7.4s, v7.4s, v14.4s\n"
+ "sqadd v0.4s, v0.4s, v16.4s\n"
+ "and v19.16b, v23.16b, v31.16b\n"
+ "ldr q10, [x23, x13]\n"
+ "ldr q26, [x22, x13]\n"
+ "and v21.16b, v22.16b, v31.16b\n"
+ "and v16.16b, v14.16b, v31.16b\n"
+ "ldr q20, [x21, x13]\n"
+ "ldr q6, [x20, x13]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "srshl v0.4s, v0.4s, v31.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v19.4s\n"
+ "ldr q17, [%x[params], #0x170]\n"
+ "add v0.4s, v0.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "ldr q8, [%x[params], #0x160]\n"
+ "sqadd v14.4s, v14.4s, v16.4s\n"
+ "ldr q30, [%x[params], #0x150]\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
+ "smax v0.4s, v0.4s, v7.4s\n"
+ "srshl v22.4s, v22.4s, v31.4s\n"
+ "srshl v14.4s, v14.4s, v31.4s\n"
+ "ldr q1, [x25, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "ldr q23, [x21, x28]\n"
- "ldr q9, [x20, x28]\n"
- "smin v28.4s, v28.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
"ldp x15, x14, [%x[inptrs], #0x0]\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v7.4s, v7.4s, v13.4s\n"
- "ldp x13, x12, [%x[inptrs], #0x10]\n"
- "ldp x10, x9, [%x[inptrs], #0x20]\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v25.4s, v25.4s, v11.4s\n"
- "ldp x26, x21, [%x[inptrs], #0x30]\n"
- "smin v7.4s, v7.4s, v11.4s\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s28, [x25, x27]\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "zip2 v17.16b, v15.16b, v21.16b\n"
- "zip1 v15.16b, v15.16b, v21.16b\n"
- "zip1 v18.16b, v31.16b, v8.16b\n"
- "zip2 v8.16b, v31.16b, v8.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s29, [x24, x27]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str s25, [x23, x27]\n"
- "zip2 v25.16b, v15.16b, v18.16b\n"
- "str s7, [x22, x27]\n"
- "zip1 v15.16b, v15.16b, v18.16b\n"
- "zip1 v7.16b, v17.16b, v8.16b\n"
- "add x27, x27, #0x4\n"
- "zip2 v8.16b, v17.16b, v8.16b\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v0.4s, v0.4s, v11.4s\n"
+ "ldp x27, x26, [%x[inptrs], #0x10]\n"
+ "ldr q5, [x23, x13]\n"
+ "ldr q27, [x22, x13]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v14.4s, v14.4s, v12.4s\n"
+ "ldp x25, x24, [%x[inptrs], #0x20]\n"
+ "ldr q16, [x21, x13]\n"
+ "ldr q25, [x20, x13]\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "ldp x23, x22, [%x[inptrs], #0x30]\n"
+ "smax v22.4s, v22.4s, v7.4s\n"
+ "smax v14.4s, v14.4s, v7.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v14.4s, v14.4s, v11.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s0, [x11, x12]\n"
+ "zip2 v18.16b, v13.16b, v29.16b\n"
+ "zip1 v13.16b, v13.16b, v29.16b\n"
+ "zip1 v0.16b, v2.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "zip2 v9.16b, v2.16b, v9.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "zip2 v2.16b, v13.16b, v0.16b\n"
+ "zip1 v13.16b, v13.16b, v0.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "str s23, [x9, x12]\n"
+ "zip1 v0.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
"ldr q31, [%x[params], #0x140]\n"
- "zip2 v29.16b, v26.16b, v19.16b\n"
"add %x[params], %x[params], #0x180\n"
- "zip1 v26.16b, v26.16b, v19.16b\n"
- "zip1 v28.16b, v4.16b, v10.16b\n"
- "zip2 v10.16b, v4.16b, v10.16b\n"
- "zip2 v24.16b, v22.16b, v2.16b\n"
- "zip1 v22.16b, v22.16b, v2.16b\n"
- "zip1 v21.16b, v3.16b, v5.16b\n"
- "zip2 v5.16b, v3.16b, v5.16b\n"
- "zip2 v18.16b, v27.16b, v23.16b\n"
- "zip1 v27.16b, v27.16b, v23.16b\n"
- "zip1 v17.16b, v30.16b, v9.16b\n"
- "zip2 v9.16b, v30.16b, v9.16b\n"
- "zip2 v23.16b, v26.16b, v28.16b\n"
- "zip1 v26.16b, v26.16b, v28.16b\n"
- "zip1 v3.16b, v29.16b, v10.16b\n"
- "zip2 v10.16b, v29.16b, v10.16b\n"
- "zip2 v19.16b, v22.16b, v21.16b\n"
- "zip1 v22.16b, v22.16b, v21.16b\n"
- "zip1 v0.16b, v24.16b, v5.16b\n"
- "zip2 v5.16b, v24.16b, v5.16b\n"
- "zip2 v24.16b, v27.16b, v17.16b\n"
- "zip1 v27.16b, v27.16b, v17.16b\n"
- "zip1 v2.16b, v18.16b, v9.16b\n"
- "zip2 v9.16b, v18.16b, v9.16b\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "mov v28.16b, v31.16b\n"
+ "zip2 v23.16b, v10.16b, v20.16b\n"
+ "zip1 v10.16b, v10.16b, v20.16b\n"
+ "str s22, [x10, x12]\n"
+ "str s14, [x28, x12]\n"
+ "zip2 v22.16b, v1.16b, v28.16b\n"
+ "zip1 v1.16b, v1.16b, v28.16b\n"
+ "add x12, x12, #0x4\n"
+ "zip1 v20.16b, v3.16b, v4.16b\n"
+ "zip2 v4.16b, v3.16b, v4.16b\n"
+ "zip1 v14.16b, v26.16b, v6.16b\n"
+ "zip2 v6.16b, v26.16b, v6.16b\n"
+ "zip2 v19.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "zip1 v16.16b, v27.16b, v25.16b\n"
+ "zip2 v25.16b, v27.16b, v25.16b\n"
+ "zip2 v21.16b, v1.16b, v20.16b\n"
+ "zip1 v1.16b, v1.16b, v20.16b\n"
+ "zip1 v28.16b, v22.16b, v4.16b\n"
+ "zip2 v4.16b, v22.16b, v4.16b\n"
+ "zip2 v29.16b, v10.16b, v14.16b\n"
+ "zip1 v10.16b, v10.16b, v14.16b\n"
+ "zip1 v27.16b, v23.16b, v6.16b\n"
+ "zip2 v6.16b, v23.16b, v6.16b\n"
+ "zip2 v18.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "zip1 v14.16b, v19.16b, v25.16b\n"
+ "zip2 v25.16b, v19.16b, v25.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
"bgt 1b\n"
"2:" // Detached iteration
- "movi v21.4s, #0x0\n"
- ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
- ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e8d97df // udot v31.4s, v30.16b, v13.16b\n"
+ ".inst 0x6e8197c3 // udot v3.4s, v30.16b, v1.16b\n"
"tst %x[n_channels], #0xf\n"
- ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
- ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
- "movi v18.4s, #0x0\n"
- "add x28, x28, #0x10\n"
- ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
- ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
- ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
- ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
- ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
- ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
- "mls v31.4s, v21.4s, v16.4s\n"
- ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
- ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
- "ldr q4, [%x[params], #0x10]\n"
- ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
- ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
- "ldr q17, [%x[params], #0x0]\n"
- "sqrdmulh v31.4s, v31.4s, v17.4s\n"
- ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
- ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
- "mls v30.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v21.4s, v16.4s\n"
- "and v27.16b, v31.16b, v4.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v17.4s\n"
- "sqrdmulh v29.4s, v29.4s, v17.4s\n"
- "sqrdmulh v28.4s, v28.4s, v17.4s\n"
- "ldr q15, [%x[params], #0x60]\n"
- "sqadd v31.4s, v31.4s, v27.4s\n"
- "and v20.16b, v30.16b, v4.16b\n"
- "and v18.16b, v29.16b, v4.16b\n"
- "and v17.16b, v28.16b, v4.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
+ "movi v20.4s, #0x0\n"
+ "add x13, x13, #0x10\n"
+ ".inst 0x6e8195f3 // udot v19.4s, v15.16b, v1.16b\n"
+ ".inst 0x6e81951f // udot v31.4s, v8.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x6e8a9503 // udot v3.4s, v8.16b, v10.16b\n"
+ ".inst 0x6e8a95f3 // udot v19.4s, v15.16b, v10.16b\n"
+ ".inst 0x6e8195f4 // udot v20.4s, v15.16b, v1.16b\n"
+ ".inst 0x6e8a963f // udot v31.4s, v17.16b, v10.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e8197d7 // udot v23.4s, v30.16b, v1.16b\n"
+ "mov v16.16b, v19.16b\n .inst 0x6e8595f0 // udot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x6e8d95f3 // udot v19.4s, v15.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x6e859623 // udot v3.4s, v17.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a95f4 // udot v20.4s, v15.16b, v10.16b\n"
+ ".inst 0x6e8d97da // udot v26.4s, v30.16b, v13.16b\n"
+ ".inst 0x6e8a9517 // udot v23.4s, v8.16b, v10.16b\n"
+ "mls v31.4s, v19.4s, v24.4s\n"
+ "movi v30.4s, #0x0\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x6e81951a // udot v26.4s, v8.16b, v1.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "mov v16.16b, v20.16b\n .inst 0x6e8595f0 // udot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x6e8d95f4 // udot v20.4s, v15.16b, v13.16b\n"
+ "ldr q8, [%x[params], #0x0]\n"
+ ".inst 0x6e9595fe // udot v30.4s, v15.16b, v21.16b\n"
+ ".inst 0x6e859637 // udot v23.4s, v17.16b, v5.16b\n"
+ ".inst 0x6e8a963a // udot v26.4s, v17.16b, v10.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v8.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ ".inst 0x6e9d95fe // udot v30.4s, v15.16b, v29.16b\n"
+ "mls v26.4s, v20.4s, v24.4s\n"
+ "movi v5.4s, #0x0\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "and v22.16b, v3.16b, v1.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "ldr q20, [%x[params], #0x60]\n"
+ "mov v19.16b, v30.16b\n .inst 0x6e9295f3 // udot v19.4s, v15.16b, v18.16b\n"
+ ".inst 0x6e8295fe // udot v30.4s, v15.16b, v2.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v1.16b\n"
+ "and v16.16b, v26.16b, v1.16b\n"
+ "sqadd v3.4s, v3.4s, v22.4s\n"
+ "ldr q8, [%x[params], #0x50]\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqadd v30.4s, v30.4s, v20.4s\n"
- "ldr q27, [%x[params], #0x40]\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "ldr q26, [%x[params], #0x50]\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "ldr q6, [%x[params], #0x30]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
- "srshl v29.4s, v29.4s, v4.4s\n"
- "srshl v28.4s, v28.4s, v4.4s\n"
- "ldr q4, [%x[params], #0x70]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v1.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x40]\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v1.4s, #0x0\n"
- ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s31, [x25, x27]\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s31, [x11, x12]\n"
"ldr q31, [%x[params], #0x20]\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s30, [x24, x27]\n"
- "mov v22.16b, v1.16b\n .inst 0x6e989596 // udot v22.4s, v12.16b, v24.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s29, [x23, x27]\n"
- "mov v29.16b, v31.16b\n"
- ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
- "str s28, [x22, x27]\n"
- "mov v21.16b, v31.16b\n"
- "mov v20.16b, v31.16b\n"
- ".inst 0x6e9994df // udot v31.4s, v6.16b, v25.16b\n"
- ".inst 0x6e9794d5 // udot v21.4s, v6.16b, v23.16b\n"
- ".inst 0x6e97977f // udot v31.4s, v27.16b, v23.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x6e9994dd // udot v29.4s, v6.16b, v25.16b\n"
- ".inst 0x6e9794d4 // udot v20.4s, v6.16b, v23.16b\n"
- ".inst 0x6e979592 // udot v18.4s, v12.16b, v23.16b\n"
- ".inst 0x6e939775 // udot v21.4s, v27.16b, v19.16b\n"
- ".inst 0x6e93975f // udot v31.4s, v26.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
- ".inst 0x6e939774 // udot v20.4s, v27.16b, v19.16b\n"
- "mls v31.4s, v1.4s, v16.4s\n"
- ".inst 0x6e939592 // udot v18.4s, v12.16b, v19.16b\n"
- ".inst 0x6e989755 // udot v21.4s, v26.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
- ".inst 0x6e989754 // udot v20.4s, v26.16b, v24.16b\n"
- "sqrdmulh v31.4s, v31.4s, v15.4s\n"
- "mov v17.16b, v18.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
- ".inst 0x6e999592 // udot v18.4s, v12.16b, v25.16b\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mls v21.4s, v22.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v4.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v29.4s, v29.4s, v15.4s\n"
- "sqrdmulh v21.4s, v21.4s, v15.4s\n"
- "sqrdmulh v20.4s, v20.4s, v15.4s\n"
- "ldr q27, [%x[params], #0xc0]\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v29.16b, v4.16b\n"
- "and v18.16b, v21.16b, v4.16b\n"
- "and v17.16b, v20.16b, v4.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s3, [x9, x12]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "mov v10.16b, v31.16b\n"
+ "str s26, [x10, x12]\n"
+ "mov v1.16b, v31.16b\n"
+ "str s23, [x28, x12]\n"
+ "mov v26.16b, v31.16b\n"
+ ".inst 0x6e82963f // udot v31.4s, v17.16b, v2.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x6e95962a // udot v10.4s, v17.16b, v21.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e95961f // udot v31.4s, v16.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6e829621 // udot v1.4s, v17.16b, v2.16b\n"
+ ".inst 0x6e95963a // udot v26.4s, v17.16b, v21.16b\n"
+ ".inst 0x6e9595e5 // udot v5.4s, v15.16b, v21.16b\n"
+ ".inst 0x6e9d960a // udot v10.4s, v16.16b, v29.16b\n"
+ ".inst 0x6e9d951f // udot v31.4s, v8.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x6e959601 // udot v1.4s, v16.16b, v21.16b\n"
+ ".inst 0x6e9d961a // udot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x6e9d95e5 // udot v5.4s, v15.16b, v29.16b\n"
+ ".inst 0x6e92950a // udot v10.4s, v8.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "mls v31.4s, v30.4s, v24.4s\n"
+ "movi v3.4s, #0x0\n"
+ ".inst 0x6e9d9501 // udot v1.4s, v8.16b, v29.16b\n"
+ ".inst 0x6e92951a // udot v26.4s, v8.16b, v18.16b\n"
+ "mov v16.16b, v5.16b\n .inst 0x6e9295f0 // udot v16.4s, v15.16b, v18.16b\n"
+ ".inst 0x6e8295e5 // udot v5.4s, v15.16b, v2.16b\n"
+ ".inst 0x6e9c95e3 // udot v3.4s, v15.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mls v10.4s, v19.4s, v24.4s\n"
+ "mls v26.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "mls v1.4s, v5.4s, v24.4s\n"
+ "movi v2.4s, #0x0\n"
+ "sqrdmulh v10.4s, v10.4s, v20.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v20.4s\n"
+ ".inst 0x6e9b95e3 // udot v3.4s, v15.16b, v27.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "and v17.16b, v10.16b, v22.16b\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v22.16b\n"
+ "and v16.16b, v1.16b, v22.16b\n"
+ "mov v19.16b, v3.16b\n .inst 0x6e8e95f3 // udot v19.4s, v15.16b, v14.16b\n"
+ ".inst 0x6e8095e3 // udot v3.4s, v15.16b, v0.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v4.4s\n"
- "sqadd v29.4s, v29.4s, v19.4s\n"
- "ldr q26, [%x[params], #0xa0]\n"
- "sqadd v21.4s, v21.4s, v18.4s\n"
- "ldr q25, [%x[params], #0xb0]\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q24, [%x[params], #0x90]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v29.4s, v29.4s, v4.4s\n"
- "srshl v21.4s, v21.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v4.4s\n"
- "ldr q1, [%x[params], #0xd0]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v17.4s\n"
+ "ldr q18, [%x[params], #0xb0]\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "ldr q17, [%x[params], #0xa0]\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v22.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "add v10.4s, v10.4s, v12.4s\n"
+ "add v1.4s, v1.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v10.4s, v10.4s, v7.4s\n"
+ "smax v1.4s, v1.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v23.4s, #0x0\n"
- ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
- ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v10.4s, v10.4s, v11.4s\n"
+ "smin v1.4s, v1.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s31, [x25, x27]\n"
- "ldr q31, [%x[params], #0x80]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v22.16b, v23.16b\n .inst 0x6e829596 // udot v22.4s, v12.16b, v2.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s29, [x24, x27]\n"
- ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s21, [x23, x27]\n"
- "mov v21.16b, v31.16b\n"
- "str s20, [x22, x27]\n"
- "mov v4.16b, v31.16b\n"
- "mov v20.16b, v31.16b\n"
- ".inst 0x6e87971f // udot v31.4s, v24.16b, v7.16b\n"
- ".inst 0x6e839704 // udot v4.4s, v24.16b, v3.16b\n"
- ".inst 0x6e83975f // udot v31.4s, v26.16b, v3.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x6e879715 // udot v21.4s, v24.16b, v7.16b\n"
- ".inst 0x6e839714 // udot v20.4s, v24.16b, v3.16b\n"
- ".inst 0x6e839592 // udot v18.4s, v12.16b, v3.16b\n"
- ".inst 0x6e809744 // udot v4.4s, v26.16b, v0.16b\n"
- ".inst 0x6e80973f // udot v31.4s, v25.16b, v0.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "str s31, [x11, x12]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s1, [x10, x12]\n"
+ "mov v30.16b, v21.16b\n"
+ "str s10, [x9, x12]\n"
+ "mov v20.16b, v21.16b\n"
+ "str s26, [x28, x12]\n"
+ "mov v29.16b, v21.16b\n"
+ ".inst 0x6e809615 // udot v21.4s, v16.16b, v0.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n"
"ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x6e839755 // udot v21.4s, v26.16b, v3.16b\n"
- ".inst 0x6e809754 // udot v20.4s, v26.16b, v0.16b\n"
- "mls v31.4s, v23.4s, v16.4s\n"
- ".inst 0x6e809592 // udot v18.4s, v12.16b, v0.16b\n"
- ".inst 0x6e829724 // udot v4.4s, v25.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x6e809735 // udot v21.4s, v25.16b, v0.16b\n"
- ".inst 0x6e829734 // udot v20.4s, v25.16b, v2.16b\n"
- "sqrdmulh v31.4s, v31.4s, v27.4s\n"
- "mov v17.16b, v18.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
- ".inst 0x6e879592 // udot v18.4s, v12.16b, v7.16b\n"
- "mls v21.4s, v18.4s, v16.4s\n"
- "mls v4.4s, v22.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v1.16b\n"
+ ".inst 0x6e9c9635 // udot v21.4s, v17.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x6e80961e // udot v30.4s, v16.16b, v0.16b\n"
+ ".inst 0x6e9c961d // udot v29.4s, v16.16b, v28.16b\n"
+ ".inst 0x6e9c95e2 // udot v2.4s, v15.16b, v28.16b\n"
+ ".inst 0x6e9b9634 // udot v20.4s, v17.16b, v27.16b\n"
+ ".inst 0x6e9b9655 // udot v21.4s, v18.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9c963e // udot v30.4s, v17.16b, v28.16b\n"
+ ".inst 0x6e9b963d // udot v29.4s, v17.16b, v27.16b\n"
+ ".inst 0x6e9b95e2 // udot v2.4s, v15.16b, v27.16b\n"
+ ".inst 0x6e8e9654 // udot v20.4s, v18.16b, v14.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "mls v21.4s, v3.4s, v24.4s\n"
+ "movi v5.4s, #0x0\n"
+ ".inst 0x6e9b965e // udot v30.4s, v18.16b, v27.16b\n"
+ ".inst 0x6e8e965d // udot v29.4s, v18.16b, v14.16b\n"
+ "mov v16.16b, v2.16b\n .inst 0x6e8e95f0 // udot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x6e8095e2 // udot v2.4s, v15.16b, v0.16b\n"
+ ".inst 0x6e8495e5 // udot v5.4s, v15.16b, v4.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v23.4s\n"
+ "mls v20.4s, v19.4s, v24.4s\n"
+ "mls v29.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v21.16b, v22.16b\n"
+ "mls v30.4s, v2.4s, v24.4s\n"
+ "movi v27.4s, #0x0\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v23.4s\n"
+ ".inst 0x6e8695e5 // udot v5.4s, v15.16b, v6.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v23.4s\n"
+ "ldr q26, [%x[params], #0x120]\n"
+ "and v17.16b, v20.16b, v22.16b\n"
+ "sqadd v21.4s, v21.4s, v16.4s\n"
+ "and v19.16b, v29.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "mov v14.16b, v5.16b\n .inst 0x6e9995ee // udot v14.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e8995e5 // udot v5.4s, v15.16b, v9.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v27.4s\n"
- "sqrdmulh v4.4s, v4.4s, v27.4s\n"
- "sqrdmulh v20.4s, v20.4s, v27.4s\n"
- "ldr q30, [%x[params], #0x120]\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v21.16b, v1.16b\n"
- "and v18.16b, v4.16b, v1.16b\n"
- "and v17.16b, v20.16b, v1.16b\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "sqadd v21.4s, v21.4s, v19.4s\n"
- "ldr q29, [%x[params], #0x100]\n"
- "sqadd v4.4s, v4.4s, v18.4s\n"
- "ldr q28, [%x[params], #0x110]\n"
"sqadd v20.4s, v20.4s, v17.4s\n"
- "ldr q27, [%x[params], #0xf0]\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v4.4s, v4.4s, v1.4s\n"
- "srshl v20.4s, v20.4s, v1.4s\n"
- "ldr q26, [%x[params], #0x130]\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v4.4s, v4.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "smin v31.4s, v31.4s, v11.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v4.4s, v4.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
+ "ldr q18, [%x[params], #0x110]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q17, [%x[params], #0x100]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "smax v21.4s, v21.4s, v7.4s\n"
+ "srshl v29.4s, v29.4s, v22.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
"smin v21.4s, v21.4s, v11.4s\n"
- "smin v4.4s, v4.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "smax v20.4s, v20.4s, v7.4s\n"
+ "smax v30.4s, v30.4s, v7.4s\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "smax v29.4s, v29.4s, v7.4s\n"
"smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
- "movi v25.4s, #0x0\n"
- ".inst 0x6e8a9599 // udot v25.4s, v12.16b, v10.16b\n"
- ".inst 0x6e859599 // udot v25.4s, v12.16b, v5.16b\n"
- "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s31, [x25, x27]\n"
- "ldr q24, [%x[params], #0xe0]\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "mov v23.16b, v25.16b\n .inst 0x6e899597 // udot v23.4s, v12.16b, v9.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x11, x12]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
"add %x[params], %x[params], #0x140\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v4.16b, v4.16b, v4.16b\n"
- "str s21, [x24, x27]\n"
- ".inst 0x6e889599 // udot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s4, [x23, x27]\n"
- "mov v22.16b, v24.16b\n"
- "str s20, [x22, x27]\n"
- "mov v21.16b, v24.16b\n"
- "mov v20.16b, v24.16b\n"
- ".inst 0x6e889778 // udot v24.4s, v27.16b, v8.16b\n"
- ".inst 0x6e8a9775 // udot v21.4s, v27.16b, v10.16b\n"
- ".inst 0x6e8a97b8 // udot v24.4s, v29.16b, v10.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- "add x27, x27, #0x4\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x6e889776 // udot v22.4s, v27.16b, v8.16b\n"
- ".inst 0x6e8a9774 // udot v20.4s, v27.16b, v10.16b\n"
- ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
- ".inst 0x6e8597b5 // udot v21.4s, v29.16b, v5.16b\n"
- ".inst 0x6e859798 // udot v24.4s, v28.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x6e8a97b6 // udot v22.4s, v29.16b, v10.16b\n"
- ".inst 0x6e8597b4 // udot v20.4s, v29.16b, v5.16b\n"
- "mls v24.4s, v25.4s, v16.4s\n"
- ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
- ".inst 0x6e899795 // udot v21.4s, v28.16b, v9.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s20, [x9, x12]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s30, [x10, x12]\n"
+ "mov v20.16b, v22.16b\n"
+ "str s29, [x28, x12]\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x6e899616 // udot v22.4s, v16.16b, v9.16b\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0x6e849615 // udot v21.4s, v16.16b, v4.16b\n"
"ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e859796 // udot v22.4s, v28.16b, v5.16b\n"
- ".inst 0x6e899794 // udot v20.4s, v28.16b, v9.16b\n"
- "sqrdmulh v24.4s, v24.4s, v30.4s\n"
- "mov v17.16b, v18.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
- ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
- "mls v22.4s, v18.4s, v16.4s\n"
- "mls v21.4s, v23.4s, v16.4s\n"
- "mls v20.4s, v17.4s, v16.4s\n"
- "and v17.16b, v24.16b, v26.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v30.4s\n"
- "sqrdmulh v21.4s, v21.4s, v30.4s\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "sqadd v24.4s, v24.4s, v17.4s\n"
- "and v19.16b, v22.16b, v26.16b\n"
- "and v18.16b, v21.16b, v26.16b\n"
- "and v17.16b, v20.16b, v26.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ ".inst 0x6e849636 // udot v22.4s, v17.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ ".inst 0x6e899614 // udot v20.4s, v16.16b, v9.16b\n"
+ ".inst 0x6e849613 // udot v19.4s, v16.16b, v4.16b\n"
+ ".inst 0x6e8495fb // udot v27.4s, v15.16b, v4.16b\n"
+ ".inst 0x6e869635 // udot v21.4s, v17.16b, v6.16b\n"
+ ".inst 0x6e869656 // udot v22.4s, v18.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x6e849634 // udot v20.4s, v17.16b, v4.16b\n"
+ ".inst 0x6e869633 // udot v19.4s, v17.16b, v6.16b\n"
+ ".inst 0x6e8695fb // udot v27.4s, v15.16b, v6.16b\n"
+ ".inst 0x6e999655 // udot v21.4s, v18.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "mls v22.4s, v5.4s, v24.4s\n"
+ ".inst 0x6e869654 // udot v20.4s, v18.16b, v6.16b\n"
+ ".inst 0x6e999653 // udot v19.4s, v18.16b, v25.16b\n"
+ "mov v17.16b, v27.16b\n .inst 0x6e9995f1 // udot v17.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e8995fb // udot v27.4s, v15.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v26.4s\n"
+ "mls v21.4s, v14.4s, v24.4s\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "mls v20.4s, v27.4s, v24.4s\n"
+ "mls v19.4s, v17.4s, v24.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v26.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v26.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v21.4s, v21.4s, v18.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
"sqadd v20.4s, v20.4s, v17.4s\n"
- "srshl v24.4s, v24.4s, v26.4s\n"
- "srshl v22.4s, v22.4s, v26.4s\n"
- "srshl v21.4s, v21.4s, v26.4s\n"
- "srshl v20.4s, v20.4s, v26.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v22.4s, v22.4s, v14.4s\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v13.4s\n"
- "smax v21.4s, v21.4s, v13.4s\n"
- "smax v20.4s, v20.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v11.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "smax v22.4s, v22.4s, v7.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v21.4s, v21.4s, v12.4s\n"
"smin v22.4s, v22.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "smax v21.4s, v21.4s, v7.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smax v20.4s, v20.4s, v7.4s\n"
+ "smax v19.4s, v19.4s, v7.4s\n"
"smin v21.4s, v21.4s, v11.4s\n"
- "smin v20.4s, v20.4s, v11.4s\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "smin v19.4s, v19.4s, v11.4s\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x11, x12]\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x25, x27]\n"
- "uzp1 v22.16b, v22.16b, v22.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "str s22, [x24, x27]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s21, [x23, x27]\n"
- "str s20, [x22, x27]\n"
- "add x27, x27, #0x4\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x10, x12]\n"
+ "str s21, [x9, x12]\n"
+ "str s19, [x28, x12]\n"
+ "add x12, x12, #0x4\n"
"beq 35f\n"
"3:" // Oddments
"and x20, %x[n_channels], #0xf\n"
- "add x15, x15, x28\n"
- "add x14, x14, x28\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
- "add x10, x10, x28\n"
- "add x9, x9, x28\n"
- "add x26, x26, x28\n"
- "add x21, x21, x28\n"
+ "add x15, x15, x13\n"
+ "add x14, x14, x13\n"
+ "add x27, x27, x13\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"tbz %x[n_channels], #3, 7f\n"
- "ldr d15, [x15], #0x8\n"
- "ldr d25, [x14], #0x8\n"
- "ldr d7, [x13], #0x8\n"
- "ldr d8, [x12], #0x8\n"
- "ldr d26, [x10], #0x8\n"
- "ldr d23, [x9], #0x8\n"
- "ldr d3, [x26], #0x8\n"
- "ldr d10, [x21], #0x8\n"
+ "ldr d13, [x15], #0x8\n"
+ "ldr d2, [x14], #0x8\n"
+ "ldr d0, [x27], #0x8\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d1, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
"tbz %x[n_channels], #2, 5f\n"
- "ld1 { v15.s }[2], [x15], #0x4\n"
- "ld1 { v25.s }[2], [x14], #0x4\n"
- "ld1 { v7.s }[2], [x13], #0x4\n"
- "ld1 { v8.s }[2], [x12], #0x4\n"
- "ld1 { v26.s }[2], [x10], #0x4\n"
- "ld1 { v23.s }[2], [x9], #0x4\n"
- "ld1 { v3.s }[2], [x26], #0x4\n"
- "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x15], #0x4\n"
+ "ld1 { v2.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x27], #0x4\n"
+ "ld1 { v9.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v28.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
"tbz %x[n_channels], #1, 4f\n"
- "ld1 { v15.h }[6], [x15], #0x2\n"
- "ld1 { v25.h }[6], [x14], #0x2\n"
- "ld1 { v7.h }[6], [x13], #0x2\n"
- "ld1 { v8.h }[6], [x12], #0x2\n"
- "ld1 { v26.h }[6], [x10], #0x2\n"
- "ld1 { v23.h }[6], [x9], #0x2\n"
- "ld1 { v3.h }[6], [x26], #0x2\n"
- "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x15], #0x2\n"
+ "ld1 { v2.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x27], #0x2\n"
+ "ld1 { v9.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v28.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[14], [x15], #0x1\n"
- "ld1 { v25.b }[14], [x14], #0x1\n"
- "ld1 { v7.b }[14], [x13], #0x1\n"
- "ld1 { v8.b }[14], [x12], #0x1\n"
- "ld1 { v26.b }[14], [x10], #0x1\n"
- "ld1 { v23.b }[14], [x9], #0x1\n"
- "ld1 { v3.b }[14], [x26], #0x1\n"
- "ld1 { v10.b }[14], [x21], #0x1\n"
+ "ld1 { v13.b }[14], [x15], #0x1\n"
+ "ld1 { v2.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x27], #0x1\n"
+ "ld1 { v9.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x25], #0x1\n"
+ "ld1 { v21.b }[14], [x24], #0x1\n"
+ "ld1 { v28.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x22], #0x1\n"
"b 11f\n"
"4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[12], [x15], #0x1\n"
- "ld1 { v25.b }[12], [x14], #0x1\n"
- "ld1 { v7.b }[12], [x13], #0x1\n"
- "ld1 { v8.b }[12], [x12], #0x1\n"
- "ld1 { v26.b }[12], [x10], #0x1\n"
- "ld1 { v23.b }[12], [x9], #0x1\n"
- "ld1 { v3.b }[12], [x26], #0x1\n"
- "ld1 { v10.b }[12], [x21], #0x1\n"
+ "ld1 { v13.b }[12], [x15], #0x1\n"
+ "ld1 { v2.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x27], #0x1\n"
+ "ld1 { v9.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x25], #0x1\n"
+ "ld1 { v21.b }[12], [x24], #0x1\n"
+ "ld1 { v28.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x22], #0x1\n"
"b 11f\n"
"5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 6f\n"
- "ld1 { v15.h }[4], [x15], #0x2\n"
- "ld1 { v25.h }[4], [x14], #0x2\n"
- "ld1 { v7.h }[4], [x13], #0x2\n"
- "ld1 { v8.h }[4], [x12], #0x2\n"
- "ld1 { v26.h }[4], [x10], #0x2\n"
- "ld1 { v23.h }[4], [x9], #0x2\n"
- "ld1 { v3.h }[4], [x26], #0x2\n"
- "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x15], #0x2\n"
+ "ld1 { v2.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x27], #0x2\n"
+ "ld1 { v9.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v28.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[10], [x15], #0x1\n"
- "ld1 { v25.b }[10], [x14], #0x1\n"
- "ld1 { v7.b }[10], [x13], #0x1\n"
- "ld1 { v8.b }[10], [x12], #0x1\n"
- "ld1 { v26.b }[10], [x10], #0x1\n"
- "ld1 { v23.b }[10], [x9], #0x1\n"
- "ld1 { v3.b }[10], [x26], #0x1\n"
- "ld1 { v10.b }[10], [x21], #0x1\n"
+ "ld1 { v13.b }[10], [x15], #0x1\n"
+ "ld1 { v2.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x27], #0x1\n"
+ "ld1 { v9.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x25], #0x1\n"
+ "ld1 { v21.b }[10], [x24], #0x1\n"
+ "ld1 { v28.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x22], #0x1\n"
"b 11f\n"
"6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[8], [x15], #0x1\n"
- "ld1 { v25.b }[8], [x14], #0x1\n"
- "ld1 { v7.b }[8], [x13], #0x1\n"
- "ld1 { v8.b }[8], [x12], #0x1\n"
- "ld1 { v26.b }[8], [x10], #0x1\n"
- "ld1 { v23.b }[8], [x9], #0x1\n"
- "ld1 { v3.b }[8], [x26], #0x1\n"
- "ld1 { v10.b }[8], [x21], #0x1\n"
+ "ld1 { v13.b }[8], [x15], #0x1\n"
+ "ld1 { v2.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x27], #0x1\n"
+ "ld1 { v9.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x25], #0x1\n"
+ "ld1 { v21.b }[8], [x24], #0x1\n"
+ "ld1 { v28.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x22], #0x1\n"
"b 11f\n"
"7:" // Oddments: Load (A): Bit 3: Unset
"tbz %x[n_channels], #2, 9f\n"
- "ldr s15, [x15], #0x4\n"
- "ldr s25, [x14], #0x4\n"
- "ldr s7, [x13], #0x4\n"
- "ldr s8, [x12], #0x4\n"
- "ldr s26, [x10], #0x4\n"
- "ldr s23, [x9], #0x4\n"
- "ldr s3, [x26], #0x4\n"
- "ldr s10, [x21], #0x4\n"
+ "ldr s13, [x15], #0x4\n"
+ "ldr s2, [x14], #0x4\n"
+ "ldr s0, [x27], #0x4\n"
+ "ldr s9, [x26], #0x4\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s28, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
"tbz %x[n_channels], #1, 8f\n"
- "ld1 { v15.h }[2], [x15], #0x2\n"
- "ld1 { v25.h }[2], [x14], #0x2\n"
- "ld1 { v7.h }[2], [x13], #0x2\n"
- "ld1 { v8.h }[2], [x12], #0x2\n"
- "ld1 { v26.h }[2], [x10], #0x2\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "ld1 { v3.h }[2], [x26], #0x2\n"
- "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x15], #0x2\n"
+ "ld1 { v2.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x27], #0x2\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[6], [x15], #0x1\n"
- "ld1 { v25.b }[6], [x14], #0x1\n"
- "ld1 { v7.b }[6], [x13], #0x1\n"
- "ld1 { v8.b }[6], [x12], #0x1\n"
- "ld1 { v26.b }[6], [x10], #0x1\n"
- "ld1 { v23.b }[6], [x9], #0x1\n"
- "ld1 { v3.b }[6], [x26], #0x1\n"
- "ld1 { v10.b }[6], [x21], #0x1\n"
+ "ld1 { v13.b }[6], [x15], #0x1\n"
+ "ld1 { v2.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x27], #0x1\n"
+ "ld1 { v9.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x25], #0x1\n"
+ "ld1 { v21.b }[6], [x24], #0x1\n"
+ "ld1 { v28.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x22], #0x1\n"
"b 11f\n"
"8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[4], [x15], #0x1\n"
- "ld1 { v25.b }[4], [x14], #0x1\n"
- "ld1 { v7.b }[4], [x13], #0x1\n"
- "ld1 { v8.b }[4], [x12], #0x1\n"
- "ld1 { v26.b }[4], [x10], #0x1\n"
- "ld1 { v23.b }[4], [x9], #0x1\n"
- "ld1 { v3.b }[4], [x26], #0x1\n"
- "ld1 { v10.b }[4], [x21], #0x1\n"
+ "ld1 { v13.b }[4], [x15], #0x1\n"
+ "ld1 { v2.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x27], #0x1\n"
+ "ld1 { v9.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x25], #0x1\n"
+ "ld1 { v21.b }[4], [x24], #0x1\n"
+ "ld1 { v28.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x22], #0x1\n"
"b 11f\n"
"9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 10f\n"
- "ldr h15, [x15], #0x2\n"
- "ldr h25, [x14], #0x2\n"
- "ldr h7, [x13], #0x2\n"
- "ldr h8, [x12], #0x2\n"
- "ldr h26, [x10], #0x2\n"
- "ldr h23, [x9], #0x2\n"
- "ldr h3, [x26], #0x2\n"
- "ldr h10, [x21], #0x2\n"
+ "ldr h13, [x15], #0x2\n"
+ "ldr h2, [x14], #0x2\n"
+ "ldr h0, [x27], #0x2\n"
+ "ldr h9, [x26], #0x2\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h28, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
"tbz %x[n_channels], #0, 11f\n"
- "ld1 { v15.b }[2], [x15], #0x1\n"
- "ld1 { v25.b }[2], [x14], #0x1\n"
- "ld1 { v7.b }[2], [x13], #0x1\n"
- "ld1 { v8.b }[2], [x12], #0x1\n"
- "ld1 { v26.b }[2], [x10], #0x1\n"
- "ld1 { v23.b }[2], [x9], #0x1\n"
- "ld1 { v3.b }[2], [x26], #0x1\n"
- "ld1 { v10.b }[2], [x21], #0x1\n"
+ "ld1 { v13.b }[2], [x15], #0x1\n"
+ "ld1 { v2.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x27], #0x1\n"
+ "ld1 { v9.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x25], #0x1\n"
+ "ld1 { v21.b }[2], [x24], #0x1\n"
+ "ld1 { v28.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x22], #0x1\n"
"b 11f\n"
"10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b15, [x15], #0x1\n"
- "ldr b25, [x14], #0x1\n"
- "ldr b7, [x13], #0x1\n"
- "ldr b8, [x12], #0x1\n"
- "ldr b26, [x10], #0x1\n"
- "ldr b23, [x9], #0x1\n"
- "ldr b3, [x26], #0x1\n"
- "ldr b10, [x21], #0x1\n"
+ "ldr b13, [x15], #0x1\n"
+ "ldr b2, [x14], #0x1\n"
+ "ldr b0, [x27], #0x1\n"
+ "ldr b9, [x26], #0x1\n"
+ "ldr b1, [x25], #0x1\n"
+ "ldr b21, [x24], #0x1\n"
+ "ldr b28, [x23], #0x1\n"
+ "ldr b4, [x22], #0x1\n"
"11:" // Oddments: Load (A): Bit 3: End
"ldp x15, x14, [%x[inptrs], #0x40]\n"
- "ldp x13, x12, [%x[inptrs], #0x50]\n"
- "add x15, x15, x28\n"
- "add x14, x14, x28\n"
- "ldp x10, x9, [%x[inptrs], #0x60]\n"
- "ldp x26, x21, [%x[inptrs], #0x70]\n"
- "add x13, x13, x28\n"
- "add x12, x12, x28\n"
- "add x10, x10, x28\n"
- "add x9, x9, x28\n"
- "add x26, x26, x28\n"
- "add x21, x21, x28\n"
+ "ldp x27, x26, [%x[inptrs], #0x50]\n"
+ "ldp x25, x24, [%x[inptrs], #0x60]\n"
+ "ldp x23, x22, [%x[inptrs], #0x70]\n"
+ "add x15, x15, x13\n"
+ "add x14, x14, x13\n"
+ "add x27, x27, x13\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "add x24, x24, x13\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
"tbz %x[n_channels], #3, 15f\n"
- "ldr d22, [x15], #0x8\n"
- "ldr d19, [x14], #0x8\n"
- "ldr d0, [x13], #0x8\n"
- "ldr d5, [x12], #0x8\n"
- "ldr d27, [x10], #0x8\n"
- "ldr d24, [x9], #0x8\n"
- "ldr d2, [x26], #0x8\n"
- "ldr d9, [x21], #0x8\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d29, [x14], #0x8\n"
+ "ldr d27, [x27], #0x8\n"
+ "ldr d6, [x26], #0x8\n"
+ "ldr d5, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
"tbz %x[n_channels], #2, 13f\n"
- "ld1 { v22.s }[2], [x15], #0x4\n"
- "ld1 { v19.s }[2], [x14], #0x4\n"
- "ld1 { v0.s }[2], [x13], #0x4\n"
- "ld1 { v5.s }[2], [x12], #0x4\n"
- "ld1 { v27.s }[2], [x10], #0x4\n"
- "ld1 { v24.s }[2], [x9], #0x4\n"
- "ld1 { v2.s }[2], [x26], #0x4\n"
- "ld1 { v9.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "ld1 { v29.s }[2], [x14], #0x4\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
+ "ld1 { v6.s }[2], [x26], #0x4\n"
+ "ld1 { v5.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v14.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
"tbz %x[n_channels], #1, 12f\n"
- "ld1 { v22.h }[6], [x15], #0x2\n"
- "ld1 { v19.h }[6], [x14], #0x2\n"
- "ld1 { v0.h }[6], [x13], #0x2\n"
- "ld1 { v5.h }[6], [x12], #0x2\n"
- "ld1 { v27.h }[6], [x10], #0x2\n"
- "ld1 { v24.h }[6], [x9], #0x2\n"
- "ld1 { v2.h }[6], [x26], #0x2\n"
- "ld1 { v9.h }[6], [x21], #0x2\n"
+ "ld1 { v10.h }[6], [x15], #0x2\n"
+ "ld1 { v29.h }[6], [x14], #0x2\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n"
+ "ld1 { v6.h }[6], [x26], #0x2\n"
+ "ld1 { v5.h }[6], [x25], #0x2\n"
+ "ld1 { v18.h }[6], [x24], #0x2\n"
+ "ld1 { v14.h }[6], [x23], #0x2\n"
+ "ld1 { v25.h }[6], [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[14], [x15], #0x1\n"
- "ld1 { v19.b }[14], [x14], #0x1\n"
- "ld1 { v0.b }[14], [x13], #0x1\n"
- "ld1 { v5.b }[14], [x12], #0x1\n"
- "ld1 { v27.b }[14], [x10], #0x1\n"
- "ld1 { v24.b }[14], [x9], #0x1\n"
- "ld1 { v2.b }[14], [x26], #0x1\n"
- "ld1 { v9.b }[14], [x21], #0x1\n"
+ "ld1 { v10.b }[14], [x15], #0x1\n"
+ "ld1 { v29.b }[14], [x14], #0x1\n"
+ "ld1 { v27.b }[14], [x27], #0x1\n"
+ "ld1 { v6.b }[14], [x26], #0x1\n"
+ "ld1 { v5.b }[14], [x25], #0x1\n"
+ "ld1 { v18.b }[14], [x24], #0x1\n"
+ "ld1 { v14.b }[14], [x23], #0x1\n"
+ "ld1 { v25.b }[14], [x22], #0x1\n"
"b 19f\n"
"12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[12], [x15], #0x1\n"
- "ld1 { v19.b }[12], [x14], #0x1\n"
- "ld1 { v0.b }[12], [x13], #0x1\n"
- "ld1 { v5.b }[12], [x12], #0x1\n"
- "ld1 { v27.b }[12], [x10], #0x1\n"
- "ld1 { v24.b }[12], [x9], #0x1\n"
- "ld1 { v2.b }[12], [x26], #0x1\n"
- "ld1 { v9.b }[12], [x21], #0x1\n"
+ "ld1 { v10.b }[12], [x15], #0x1\n"
+ "ld1 { v29.b }[12], [x14], #0x1\n"
+ "ld1 { v27.b }[12], [x27], #0x1\n"
+ "ld1 { v6.b }[12], [x26], #0x1\n"
+ "ld1 { v5.b }[12], [x25], #0x1\n"
+ "ld1 { v18.b }[12], [x24], #0x1\n"
+ "ld1 { v14.b }[12], [x23], #0x1\n"
+ "ld1 { v25.b }[12], [x22], #0x1\n"
"b 19f\n"
"13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 14f\n"
- "ld1 { v22.h }[4], [x15], #0x2\n"
- "ld1 { v19.h }[4], [x14], #0x2\n"
- "ld1 { v0.h }[4], [x13], #0x2\n"
- "ld1 { v5.h }[4], [x12], #0x2\n"
- "ld1 { v27.h }[4], [x10], #0x2\n"
- "ld1 { v24.h }[4], [x9], #0x2\n"
- "ld1 { v2.h }[4], [x26], #0x2\n"
- "ld1 { v9.h }[4], [x21], #0x2\n"
+ "ld1 { v10.h }[4], [x15], #0x2\n"
+ "ld1 { v29.h }[4], [x14], #0x2\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n"
+ "ld1 { v6.h }[4], [x26], #0x2\n"
+ "ld1 { v5.h }[4], [x25], #0x2\n"
+ "ld1 { v18.h }[4], [x24], #0x2\n"
+ "ld1 { v14.h }[4], [x23], #0x2\n"
+ "ld1 { v25.h }[4], [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[10], [x15], #0x1\n"
- "ld1 { v19.b }[10], [x14], #0x1\n"
- "ld1 { v0.b }[10], [x13], #0x1\n"
- "ld1 { v5.b }[10], [x12], #0x1\n"
- "ld1 { v27.b }[10], [x10], #0x1\n"
- "ld1 { v24.b }[10], [x9], #0x1\n"
- "ld1 { v2.b }[10], [x26], #0x1\n"
- "ld1 { v9.b }[10], [x21], #0x1\n"
+ "ld1 { v10.b }[10], [x15], #0x1\n"
+ "ld1 { v29.b }[10], [x14], #0x1\n"
+ "ld1 { v27.b }[10], [x27], #0x1\n"
+ "ld1 { v6.b }[10], [x26], #0x1\n"
+ "ld1 { v5.b }[10], [x25], #0x1\n"
+ "ld1 { v18.b }[10], [x24], #0x1\n"
+ "ld1 { v14.b }[10], [x23], #0x1\n"
+ "ld1 { v25.b }[10], [x22], #0x1\n"
"b 19f\n"
"14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[8], [x15], #0x1\n"
- "ld1 { v19.b }[8], [x14], #0x1\n"
- "ld1 { v0.b }[8], [x13], #0x1\n"
- "ld1 { v5.b }[8], [x12], #0x1\n"
- "ld1 { v27.b }[8], [x10], #0x1\n"
- "ld1 { v24.b }[8], [x9], #0x1\n"
- "ld1 { v2.b }[8], [x26], #0x1\n"
- "ld1 { v9.b }[8], [x21], #0x1\n"
+ "ld1 { v10.b }[8], [x15], #0x1\n"
+ "ld1 { v29.b }[8], [x14], #0x1\n"
+ "ld1 { v27.b }[8], [x27], #0x1\n"
+ "ld1 { v6.b }[8], [x26], #0x1\n"
+ "ld1 { v5.b }[8], [x25], #0x1\n"
+ "ld1 { v18.b }[8], [x24], #0x1\n"
+ "ld1 { v14.b }[8], [x23], #0x1\n"
+ "ld1 { v25.b }[8], [x22], #0x1\n"
"b 19f\n"
"15:" // Oddments: Load (B): Bit 3: Unset
"tbz %x[n_channels], #2, 17f\n"
- "ldr s22, [x15], #0x4\n"
- "ldr s19, [x14], #0x4\n"
- "ldr s0, [x13], #0x4\n"
- "ldr s5, [x12], #0x4\n"
- "ldr s27, [x10], #0x4\n"
- "ldr s24, [x9], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s9, [x21], #0x4\n"
+ "ldr s10, [x15], #0x4\n"
+ "ldr s29, [x14], #0x4\n"
+ "ldr s27, [x27], #0x4\n"
+ "ldr s6, [x26], #0x4\n"
+ "ldr s5, [x25], #0x4\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s14, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v22.h }[2], [x15], #0x2\n"
- "ld1 { v19.h }[2], [x14], #0x2\n"
- "ld1 { v0.h }[2], [x13], #0x2\n"
- "ld1 { v5.h }[2], [x12], #0x2\n"
- "ld1 { v27.h }[2], [x10], #0x2\n"
- "ld1 { v24.h }[2], [x9], #0x2\n"
- "ld1 { v2.h }[2], [x26], #0x2\n"
- "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x15], #0x2\n"
+ "ld1 { v29.h }[2], [x14], #0x2\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
+ "ld1 { v6.h }[2], [x26], #0x2\n"
+ "ld1 { v5.h }[2], [x25], #0x2\n"
+ "ld1 { v18.h }[2], [x24], #0x2\n"
+ "ld1 { v14.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[6], [x15], #0x1\n"
- "ld1 { v19.b }[6], [x14], #0x1\n"
- "ld1 { v0.b }[6], [x13], #0x1\n"
- "ld1 { v5.b }[6], [x12], #0x1\n"
- "ld1 { v27.b }[6], [x10], #0x1\n"
- "ld1 { v24.b }[6], [x9], #0x1\n"
- "ld1 { v2.b }[6], [x26], #0x1\n"
- "ld1 { v9.b }[6], [x21], #0x1\n"
+ "ld1 { v10.b }[6], [x15], #0x1\n"
+ "ld1 { v29.b }[6], [x14], #0x1\n"
+ "ld1 { v27.b }[6], [x27], #0x1\n"
+ "ld1 { v6.b }[6], [x26], #0x1\n"
+ "ld1 { v5.b }[6], [x25], #0x1\n"
+ "ld1 { v18.b }[6], [x24], #0x1\n"
+ "ld1 { v14.b }[6], [x23], #0x1\n"
+ "ld1 { v25.b }[6], [x22], #0x1\n"
"b 19f\n"
"16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[4], [x15], #0x1\n"
- "ld1 { v19.b }[4], [x14], #0x1\n"
- "ld1 { v0.b }[4], [x13], #0x1\n"
- "ld1 { v5.b }[4], [x12], #0x1\n"
- "ld1 { v27.b }[4], [x10], #0x1\n"
- "ld1 { v24.b }[4], [x9], #0x1\n"
- "ld1 { v2.b }[4], [x26], #0x1\n"
- "ld1 { v9.b }[4], [x21], #0x1\n"
+ "ld1 { v10.b }[4], [x15], #0x1\n"
+ "ld1 { v29.b }[4], [x14], #0x1\n"
+ "ld1 { v27.b }[4], [x27], #0x1\n"
+ "ld1 { v6.b }[4], [x26], #0x1\n"
+ "ld1 { v5.b }[4], [x25], #0x1\n"
+ "ld1 { v18.b }[4], [x24], #0x1\n"
+ "ld1 { v14.b }[4], [x23], #0x1\n"
+ "ld1 { v25.b }[4], [x22], #0x1\n"
"b 19f\n"
"17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr h22, [x15], #0x2\n"
- "ldr h19, [x14], #0x2\n"
- "ldr h0, [x13], #0x2\n"
- "ldr h5, [x12], #0x2\n"
- "ldr h27, [x10], #0x2\n"
- "ldr h24, [x9], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h9, [x21], #0x2\n"
+ "ldr h10, [x15], #0x2\n"
+ "ldr h29, [x14], #0x2\n"
+ "ldr h27, [x27], #0x2\n"
+ "ldr h6, [x26], #0x2\n"
+ "ldr h5, [x25], #0x2\n"
+ "ldr h18, [x24], #0x2\n"
+ "ldr h14, [x23], #0x2\n"
+ "ldr h25, [x22], #0x2\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v22.b }[2], [x15], #0x1\n"
- "ld1 { v19.b }[2], [x14], #0x1\n"
- "ld1 { v0.b }[2], [x13], #0x1\n"
- "ld1 { v5.b }[2], [x12], #0x1\n"
- "ld1 { v27.b }[2], [x10], #0x1\n"
- "ld1 { v24.b }[2], [x9], #0x1\n"
- "ld1 { v2.b }[2], [x26], #0x1\n"
- "ld1 { v9.b }[2], [x21], #0x1\n"
+ "ld1 { v10.b }[2], [x15], #0x1\n"
+ "ld1 { v29.b }[2], [x14], #0x1\n"
+ "ld1 { v27.b }[2], [x27], #0x1\n"
+ "ld1 { v6.b }[2], [x26], #0x1\n"
+ "ld1 { v5.b }[2], [x25], #0x1\n"
+ "ld1 { v18.b }[2], [x24], #0x1\n"
+ "ld1 { v14.b }[2], [x23], #0x1\n"
+ "ld1 { v25.b }[2], [x22], #0x1\n"
"b 19f\n"
"18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
- "ldr b22, [x15], #0x1\n"
- "ldr b19, [x14], #0x1\n"
- "ldr b0, [x13], #0x1\n"
- "ldr b5, [x12], #0x1\n"
- "ldr b27, [x10], #0x1\n"
- "ldr b24, [x9], #0x1\n"
- "ldr b2, [x26], #0x1\n"
- "ldr b9, [x21], #0x1\n"
+ "ldr b10, [x15], #0x1\n"
+ "ldr b29, [x14], #0x1\n"
+ "ldr b27, [x27], #0x1\n"
+ "ldr b6, [x26], #0x1\n"
+ "ldr b5, [x25], #0x1\n"
+ "ldr b18, [x24], #0x1\n"
+ "ldr b14, [x23], #0x1\n"
+ "ldr b25, [x22], #0x1\n"
"19:" // Oddments: Load (B): Bit 3: End
"ldr q20, [%x[params], #0x10]\n"
- "ldr q6, [%x[params], #0x20]\n"
- "zip2 v1.16b, v26.16b, v3.16b\n"
- "zip1 v26.16b, v26.16b, v3.16b\n"
- "ldr q4, [%x[params], #0x30]\n"
- "zip1 v18.16b, v23.16b, v10.16b\n"
- "zip2 v30.16b, v15.16b, v7.16b\n"
+ "ldr q17, [%x[params], #0x20]\n"
+ "zip2 v26.16b, v1.16b, v28.16b\n"
+ "zip1 v1.16b, v1.16b, v28.16b\n"
+ "ldr q30, [%x[params], #0x30]\n"
+ "zip1 v19.16b, v21.16b, v4.16b\n"
+ "zip2 v23.16b, v13.16b, v0.16b\n"
"cmp x20, #0x4\n"
- "zip1 v15.16b, v15.16b, v7.16b\n"
- "zip1 v29.16b, v25.16b, v8.16b\n"
- "zip2 v8.16b, v25.16b, v8.16b\n"
- "zip2 v10.16b, v23.16b, v10.16b\n"
- "zip2 v23.16b, v26.16b, v18.16b\n"
- "zip1 v26.16b, v26.16b, v18.16b\n"
- "zip2 v28.16b, v22.16b, v0.16b\n"
- "zip1 v22.16b, v22.16b, v0.16b\n"
- "zip1 v21.16b, v19.16b, v5.16b\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6e9a9591 // udot v17.4s, v12.16b, v26.16b\n"
- "zip2 v25.16b, v15.16b, v29.16b\n"
- "zip1 v15.16b, v15.16b, v29.16b\n"
- "zip1 v7.16b, v30.16b, v8.16b\n"
- "zip2 v8.16b, v30.16b, v8.16b\n"
+ "zip1 v13.16b, v13.16b, v0.16b\n"
+ "zip1 v22.16b, v2.16b, v9.16b\n"
+ "zip2 v9.16b, v2.16b, v9.16b\n"
+ "zip2 v4.16b, v21.16b, v4.16b\n"
+ "zip2 v21.16b, v1.16b, v19.16b\n"
+ "zip1 v1.16b, v1.16b, v19.16b\n"
+ "zip2 v16.16b, v10.16b, v27.16b\n"
+ "zip1 v10.16b, v10.16b, v27.16b\n"
+ "zip1 v19.16b, v29.16b, v6.16b\n"
+ "movi v8.4s, #0x0\n"
+ "zip2 v2.16b, v13.16b, v22.16b\n"
+ "zip1 v13.16b, v13.16b, v22.16b\n"
+ "zip1 v0.16b, v23.16b, v9.16b\n"
+ "zip2 v9.16b, v23.16b, v9.16b\n"
"ldr q31, [%x[params], #0x0]\n"
- "zip2 v5.16b, v19.16b, v5.16b\n"
- "zip2 v30.16b, v27.16b, v2.16b\n"
- "zip1 v27.16b, v27.16b, v2.16b\n"
- "zip1 v18.16b, v24.16b, v9.16b\n"
- "zip2 v9.16b, v24.16b, v9.16b\n"
- "zip2 v19.16b, v22.16b, v21.16b\n"
- "zip1 v22.16b, v22.16b, v21.16b\n"
- "zip1 v3.16b, v1.16b, v10.16b\n"
- ".inst 0x6e969591 // udot v17.4s, v12.16b, v22.16b\n"
- "zip2 v10.16b, v1.16b, v10.16b\n"
- "zip1 v0.16b, v28.16b, v5.16b\n"
- "zip2 v5.16b, v28.16b, v5.16b\n"
- "zip2 v24.16b, v27.16b, v18.16b\n"
- "zip1 v27.16b, v27.16b, v18.16b\n"
- "zip1 v2.16b, v30.16b, v9.16b\n"
- "mov v18.16b, v17.16b\n .inst 0x6e9b9592 // udot v18.4s, v12.16b, v27.16b\n"
- "zip2 v9.16b, v30.16b, v9.16b\n"
- "mov v30.16b, v31.16b\n"
- ".inst 0x6e8f9591 // udot v17.4s, v12.16b, v15.16b\n"
- "mov v29.16b, v31.16b\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x6e8f969f // udot v31.4s, v20.16b, v15.16b\n"
- ".inst 0x6e9a969d // udot v29.4s, v20.16b, v26.16b\n"
- ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "movi v1.4s, #0x0\n"
- "ext v15.16b, v15.16b, v15.16b, #0x1\n"
- ".inst 0x6e9a9581 // udot v1.4s, v12.16b, v26.16b\n"
- ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
- ".inst 0x6e96949f // udot v31.4s, v4.16b, v22.16b\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- ".inst 0x6e8f969e // udot v30.4s, v20.16b, v15.16b\n"
- ".inst 0x6e9a969c // udot v28.4s, v20.16b, v26.16b\n"
- "mls v31.4s, v17.4s, v16.4s\n"
- ".inst 0x6e969581 // udot v1.4s, v12.16b, v22.16b\n"
- ".inst 0x6e9b949d // udot v29.4s, v4.16b, v27.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
- "ldr q21, [%x[params], #0x50]\n"
- ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mov v20.16b, v1.16b\n .inst 0x6e9b9594 // udot v20.4s, v12.16b, v27.16b\n"
- ".inst 0x6e8f9581 // udot v1.4s, v12.16b, v15.16b\n"
- "ldr q18, [%x[params], #0x40]\n"
- "sqrdmulh v31.4s, v31.4s, v18.4s\n"
- ".inst 0x6e96949e // udot v30.4s, v4.16b, v22.16b\n"
- ".inst 0x6e9b949c // udot v28.4s, v4.16b, v27.16b\n"
- "mls v30.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e8195e8 // udot v8.4s, v15.16b, v1.16b\n"
+ "zip2 v6.16b, v29.16b, v6.16b\n"
+ "zip2 v22.16b, v5.16b, v14.16b\n"
+ "zip1 v5.16b, v5.16b, v14.16b\n"
+ "zip1 v3.16b, v18.16b, v25.16b\n"
+ "zip2 v25.16b, v18.16b, v25.16b\n"
+ "zip2 v29.16b, v10.16b, v19.16b\n"
+ "zip1 v10.16b, v10.16b, v19.16b\n"
+ "zip1 v28.16b, v26.16b, v4.16b\n"
+ "zip2 v4.16b, v26.16b, v4.16b\n"
+ "zip1 v27.16b, v16.16b, v6.16b\n"
+ "zip2 v6.16b, v16.16b, v6.16b\n"
+ "zip2 v18.16b, v5.16b, v3.16b\n"
+ "zip1 v5.16b, v5.16b, v3.16b\n"
+ "zip1 v14.16b, v22.16b, v25.16b\n"
+ ".inst 0x6e8a95e8 // udot v8.4s, v15.16b, v10.16b\n"
+ "zip2 v25.16b, v22.16b, v25.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x6e8d969f // udot v31.4s, v20.16b, v13.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e819683 // udot v3.4s, v20.16b, v1.16b\n"
+ "mov v16.16b, v8.16b\n .inst 0x6e8595f0 // udot v16.4s, v15.16b, v5.16b\n"
+ ".inst 0x6e8d95e8 // udot v8.4s, v15.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x6e81963f // udot v31.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x6e8a9623 // udot v3.4s, v17.16b, v10.16b\n"
+ ".inst 0x6e8d969a // udot v26.4s, v20.16b, v13.16b\n"
+ ".inst 0x6e8195f6 // udot v22.4s, v15.16b, v1.16b\n"
+ ".inst 0x6e8a97df // udot v31.4s, v30.16b, v10.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e819697 // udot v23.4s, v20.16b, v1.16b\n"
+ ".inst 0x6e8597c3 // udot v3.4s, v30.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e81963a // udot v26.4s, v17.16b, v1.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x6e8a95f6 // udot v22.4s, v15.16b, v10.16b\n"
+ "mls v31.4s, v8.4s, v24.4s\n"
+ ".inst 0x6e8a9637 // udot v23.4s, v17.16b, v10.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ "mov v19.16b, v22.16b\n .inst 0x6e8595f3 // udot v19.4s, v15.16b, v5.16b\n"
+ ".inst 0x6e8d95f6 // udot v22.4s, v15.16b, v13.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
"add %x[params], %x[params], #0x60\n"
- "mls v28.4s, v20.4s, v16.4s\n"
- "and v17.16b, v31.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v18.4s\n"
- "sqrdmulh v29.4s, v29.4s, v18.4s\n"
- "sqrdmulh v28.4s, v28.4s, v18.4s\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v17.16b, v30.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
- "and v26.16b, v28.16b, v21.16b\n"
+ ".inst 0x6e8a97da // udot v26.4s, v30.16b, v10.16b\n"
+ ".inst 0x6e8597d7 // udot v23.4s, v30.16b, v5.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v26.4s, v22.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "mls v23.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "and v19.16b, v3.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v16.16b, v23.16b, v20.16b\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v28.4s, v28.4s, v26.4s\n"
- "srshl v31.4s, v31.4s, v21.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "srshl v29.4s, v29.4s, v21.4s\n"
- "srshl v28.4s, v28.4s, v21.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v20.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"blt 20f\n"
- "str s31, [x25, x27]\n"
- "str s30, [x24, x27]\n"
- "str s29, [x23, x27]\n"
- "str s28, [x22, x27]\n"
+ "str s31, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s3, [x9, x12]\n"
+ "str s23, [x28, x12]\n"
"b 23f\n"
"20:" // Oddments: Unroll 0: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 21f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 22f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 22f\n"
"21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
"23:" // Oddments: Unroll 0: After oddment store
"subs x20, x20, #0x4\n"
- "add x27, x27, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
"ldr q31, [%x[params], #0x0]\n"
- "ldr q27, [%x[params], #0x10]\n"
- "movi v1.4s, #0x0\n"
- ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
- "ldr q26, [%x[params], #0x20]\n"
- "ldr q22, [%x[params], #0x30]\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "ldr q4, [%x[params], #0x40]\n"
- "ldr q21, [%x[params], #0x50]\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x6e99977f // udot v31.4s, v27.16b, v25.16b\n"
- ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
- ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
- "movi v20.4s, #0x0\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q20, [%x[params], #0x30]\n"
"cmp x20, #0x4\n"
- ".inst 0x6e97975f // udot v31.4s, v26.16b, v23.16b\n"
- "mov v18.16b, v1.16b\n .inst 0x6e989592 // udot v18.4s, v12.16b, v24.16b\n"
- "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ ".inst 0x6e9595e8 // udot v8.4s, v15.16b, v21.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
- "ext v25.16b, v25.16b, v25.16b, #0x1\n"
- ".inst 0x6e99977e // udot v30.4s, v27.16b, v25.16b\n"
- ".inst 0x6e97977c // udot v28.4s, v27.16b, v23.16b\n"
- ".inst 0x6e979594 // udot v20.4s, v12.16b, v23.16b\n"
- ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
- ".inst 0x6e9396df // udot v31.4s, v22.16b, v19.16b\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x6e97975e // udot v30.4s, v26.16b, v23.16b\n"
- ".inst 0x6e93975c // udot v28.4s, v26.16b, v19.16b\n"
- "mls v31.4s, v1.4s, v16.4s\n"
- ".inst 0x6e939594 // udot v20.4s, v12.16b, v19.16b\n"
- ".inst 0x6e9896dd // udot v29.4s, v22.16b, v24.16b\n"
- "ext v24.16b, v24.16b, v24.16b, #0x1\n"
- ".inst 0x6e9396de // udot v30.4s, v22.16b, v19.16b\n"
- ".inst 0x6e9896dc // udot v28.4s, v22.16b, v24.16b\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "mov v17.16b, v20.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
- ".inst 0x6e999594 // udot v20.4s, v12.16b, v25.16b\n"
- "mls v30.4s, v20.4s, v16.4s\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v21.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v30.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
- "and v17.16b, v28.16b, v21.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x6e8294bf // udot v31.4s, v5.16b, v2.16b\n"
+ ".inst 0x6e9594a3 // udot v3.4s, v5.16b, v21.16b\n"
+ ".inst 0x6e9d95e8 // udot v8.4s, v15.16b, v29.16b\n"
+ ".inst 0x6e9596df // udot v31.4s, v22.16b, v21.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6e9594b7 // udot v23.4s, v5.16b, v21.16b\n"
+ ".inst 0x6e9595fe // udot v30.4s, v15.16b, v21.16b\n"
+ ".inst 0x6e9d96c3 // udot v3.4s, v22.16b, v29.16b\n"
+ "mov v16.16b, v8.16b\n .inst 0x6e9295f0 // udot v16.4s, v15.16b, v18.16b\n"
+ ".inst 0x6e8295e8 // udot v8.4s, v15.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e9d969f // udot v31.4s, v20.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x6e8294ba // udot v26.4s, v5.16b, v2.16b\n"
+ ".inst 0x6e929683 // udot v3.4s, v20.16b, v18.16b\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ ".inst 0x6e9d96d7 // udot v23.4s, v22.16b, v29.16b\n"
+ ".inst 0x6e9d95fe // udot v30.4s, v15.16b, v29.16b\n"
+ "mls v31.4s, v8.4s, v24.4s\n"
+ ".inst 0x6e9596da // udot v26.4s, v22.16b, v21.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x6e929697 // udot v23.4s, v20.16b, v18.16b\n"
+ "mov v16.16b, v30.16b\n .inst 0x6e9295f0 // udot v16.4s, v15.16b, v18.16b\n"
+ ".inst 0x6e8295fe // udot v30.4s, v15.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e9d969a // udot v26.4s, v20.16b, v29.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "mls v26.4s, v30.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v18.16b, v3.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v18.4s\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v19.4s\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "srshl v31.4s, v31.4s, v21.4s\n"
- "srshl v30.4s, v30.4s, v21.4s\n"
- "srshl v29.4s, v29.4s, v21.4s\n"
- "srshl v28.4s, v28.4s, v21.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"blt 24f\n"
- "str s31, [x25, x27]\n"
- "str s30, [x24, x27]\n"
- "str s29, [x23, x27]\n"
- "str s28, [x22, x27]\n"
+ "str s31, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s3, [x9, x12]\n"
+ "str s23, [x28, x12]\n"
"b 27f\n"
"24:" // Oddments: Unroll 1: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 25f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 26f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 26f\n"
"25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
"27:" // Oddments: Unroll 1: After oddment store
"subs x20, x20, #0x4\n"
- "add x27, x27, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
"ldr q31, [%x[params], #0x0]\n"
- "ldr q25, [%x[params], #0x10]\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x6e839598 // udot v24.4s, v12.16b, v3.16b\n"
- "ldr q23, [%x[params], #0x20]\n"
- "ldr q22, [%x[params], #0x30]\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "ldr q21, [%x[params], #0x40]\n"
- "ldr q20, [%x[params], #0x50]\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x6e87973f // udot v31.4s, v25.16b, v7.16b\n"
- ".inst 0x6e809598 // udot v24.4s, v12.16b, v0.16b\n"
- ".inst 0x6e83973d // udot v29.4s, v25.16b, v3.16b\n"
- "movi v19.4s, #0x0\n"
+ "ldr q29, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ "ldr q18, [%x[params], #0x30]\n"
"cmp x20, #0x4\n"
- ".inst 0x6e8396ff // udot v31.4s, v23.16b, v3.16b\n"
- "mov v18.16b, v24.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
- "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ ".inst 0x6e9c95f6 // udot v22.4s, v15.16b, v28.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e879598 // udot v24.4s, v12.16b, v7.16b\n"
- "ext v7.16b, v7.16b, v7.16b, #0x1\n"
- ".inst 0x6e87973e // udot v30.4s, v25.16b, v7.16b\n"
- ".inst 0x6e83973c // udot v28.4s, v25.16b, v3.16b\n"
- ".inst 0x6e839593 // udot v19.4s, v12.16b, v3.16b\n"
- ".inst 0x6e8096fd // udot v29.4s, v23.16b, v0.16b\n"
- ".inst 0x6e8096df // udot v31.4s, v22.16b, v0.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x6e8097bf // udot v31.4s, v29.16b, v0.16b\n"
+ ".inst 0x6e9c97a3 // udot v3.4s, v29.16b, v28.16b\n"
+ ".inst 0x6e9b95f6 // udot v22.4s, v15.16b, v27.16b\n"
+ ".inst 0x6e9c969f // udot v31.4s, v20.16b, v28.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x6e9c97b7 // udot v23.4s, v29.16b, v28.16b\n"
+ ".inst 0x6e9c95f5 // udot v21.4s, v15.16b, v28.16b\n"
+ ".inst 0x6e9b9683 // udot v3.4s, v20.16b, v27.16b\n"
+ "mov v16.16b, v22.16b\n .inst 0x6e8e95f0 // udot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x6e8095f6 // udot v22.4s, v15.16b, v0.16b\n"
"ext v0.16b, v0.16b, v0.16b, #0x1\n"
- ".inst 0x6e8396fe // udot v30.4s, v23.16b, v3.16b\n"
- ".inst 0x6e8096fc // udot v28.4s, v23.16b, v0.16b\n"
- "mls v31.4s, v24.4s, v16.4s\n"
- ".inst 0x6e809593 // udot v19.4s, v12.16b, v0.16b\n"
- ".inst 0x6e8296dd // udot v29.4s, v22.16b, v2.16b\n"
- "ext v2.16b, v2.16b, v2.16b, #0x1\n"
- ".inst 0x6e8096de // udot v30.4s, v22.16b, v0.16b\n"
- ".inst 0x6e8296dc // udot v28.4s, v22.16b, v2.16b\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
- "mov v17.16b, v19.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
- ".inst 0x6e879593 // udot v19.4s, v12.16b, v7.16b\n"
- "mls v30.4s, v19.4s, v16.4s\n"
- "mls v29.4s, v18.4s, v16.4s\n"
- "mls v28.4s, v17.4s, v16.4s\n"
- "and v17.16b, v31.16b, v20.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqadd v31.4s, v31.4s, v17.4s\n"
- "and v19.16b, v30.16b, v20.16b\n"
- "and v18.16b, v29.16b, v20.16b\n"
- "and v17.16b, v28.16b, v20.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
+ ".inst 0x6e9b965f // udot v31.4s, v18.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e8097ba // udot v26.4s, v29.16b, v0.16b\n"
+ ".inst 0x6e8e9643 // udot v3.4s, v18.16b, v14.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ ".inst 0x6e9b9697 // udot v23.4s, v20.16b, v27.16b\n"
+ ".inst 0x6e9b95f5 // udot v21.4s, v15.16b, v27.16b\n"
+ "mls v31.4s, v22.4s, v24.4s\n"
+ ".inst 0x6e9c969a // udot v26.4s, v20.16b, v28.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x6e8e9657 // udot v23.4s, v18.16b, v14.16b\n"
+ "mov v16.16b, v21.16b\n .inst 0x6e8e95f0 // udot v16.4s, v15.16b, v14.16b\n"
+ ".inst 0x6e8095f5 // udot v21.4s, v15.16b, v0.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e9b965a // udot v26.4s, v18.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "mls v26.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v18.16b, v3.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v18.4s\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v19.4s\n"
- "sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "srshl v31.4s, v31.4s, v20.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
- "srshl v29.4s, v29.4s, v20.4s\n"
- "srshl v28.4s, v28.4s, v20.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"blt 28f\n"
- "str s31, [x25, x27]\n"
- "str s30, [x24, x27]\n"
- "str s29, [x23, x27]\n"
- "str s28, [x22, x27]\n"
+ "str s31, [x11, x12]\n"
+ "str s26, [x10, x12]\n"
+ "str s3, [x9, x12]\n"
+ "str s23, [x28, x12]\n"
"b 31f\n"
"28:" // Oddments: Unroll 2: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 29f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 30f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 30f\n"
"29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
"31:" // Oddments: Unroll 2: After oddment store
"subs x20, x20, #0x4\n"
- "add x27, x27, #0x4\n"
+ "add x12, x12, #0x4\n"
"ble 35f\n"
"ldr q31, [%x[params], #0x0]\n"
- "ldr q23, [%x[params], #0x10]\n"
+ "ldr q1, [%x[params], #0x10]\n"
"movi v22.4s, #0x0\n"
- ".inst 0x6e8a9596 // udot v22.4s, v12.16b, v10.16b\n"
- "ldr q21, [%x[params], #0x20]\n"
- "ldr q19, [%x[params], #0x30]\n"
- "mov v30.16b, v31.16b\n"
- "mov v29.16b, v31.16b\n"
- "ldr q20, [%x[params], #0x40]\n"
- "ldr q26, [%x[params], #0x50]\n"
- "mov v28.16b, v31.16b\n"
- ".inst 0x6e8896ff // udot v31.4s, v23.16b, v8.16b\n"
- ".inst 0x6e859596 // udot v22.4s, v12.16b, v5.16b\n"
- ".inst 0x6e8a96fd // udot v29.4s, v23.16b, v10.16b\n"
- "movi v18.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q19, [%x[params], #0x50]\n"
+ ".inst 0x6e8495f6 // udot v22.4s, v15.16b, v4.16b\n"
"add %x[params], %x[params], #0x60\n"
- ".inst 0x6e8a96bf // udot v31.4s, v21.16b, v10.16b\n"
- "mov v17.16b, v22.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
- "ext v10.16b, v10.16b, v10.16b, #0x1\n"
- ".inst 0x6e889596 // udot v22.4s, v12.16b, v8.16b\n"
- "ext v8.16b, v8.16b, v8.16b, #0x1\n"
- ".inst 0x6e8896fe // udot v30.4s, v23.16b, v8.16b\n"
- ".inst 0x6e8a96fc // udot v28.4s, v23.16b, v10.16b\n"
- ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
- ".inst 0x6e8596bd // udot v29.4s, v21.16b, v5.16b\n"
- ".inst 0x6e85967f // udot v31.4s, v19.16b, v5.16b\n"
- "ext v5.16b, v5.16b, v5.16b, #0x1\n"
- ".inst 0x6e8a96be // udot v30.4s, v21.16b, v10.16b\n"
- ".inst 0x6e8596bc // udot v28.4s, v21.16b, v5.16b\n"
- "mls v31.4s, v22.4s, v16.4s\n"
- ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
- ".inst 0x6e89967d // udot v29.4s, v19.16b, v9.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v3.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ ".inst 0x6e89943f // udot v31.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e849423 // udot v3.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e8695f6 // udot v22.4s, v15.16b, v6.16b\n"
+ ".inst 0x6e84969f // udot v31.4s, v20.16b, v4.16b\n"
+ "ext v4.16b, v4.16b, v4.16b, #0x1\n"
+ ".inst 0x6e849437 // udot v23.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e8495f5 // udot v21.4s, v15.16b, v4.16b\n"
+ ".inst 0x6e869683 // udot v3.4s, v20.16b, v6.16b\n"
+ "mov v16.16b, v22.16b\n .inst 0x6e9995f0 // udot v16.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e8995f6 // udot v22.4s, v15.16b, v9.16b\n"
"ext v9.16b, v9.16b, v9.16b, #0x1\n"
- ".inst 0x6e85967e // udot v30.4s, v19.16b, v5.16b\n"
- ".inst 0x6e89967c // udot v28.4s, v19.16b, v9.16b\n"
- "sqrdmulh v31.4s, v31.4s, v20.4s\n"
- "mov v7.16b, v18.16b\n .inst 0x6e899587 // udot v7.4s, v12.16b, v9.16b\n"
- ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
- "mls v30.4s, v18.4s, v16.4s\n"
- "mls v29.4s, v17.4s, v16.4s\n"
- "mls v28.4s, v7.4s, v16.4s\n"
- "and v16.16b, v31.16b, v26.16b\n"
+ ".inst 0x6e86965f // udot v31.4s, v18.16b, v6.16b\n"
+ "ext v6.16b, v6.16b, v6.16b, #0x1\n"
+ ".inst 0x6e89943a // udot v26.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e999643 // udot v3.4s, v18.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x6e869697 // udot v23.4s, v20.16b, v6.16b\n"
+ ".inst 0x6e8695f5 // udot v21.4s, v15.16b, v6.16b\n"
+ "mls v31.4s, v22.4s, v24.4s\n"
+ ".inst 0x6e84969a // udot v26.4s, v20.16b, v4.16b\n"
+ "mls v3.4s, v16.4s, v24.4s\n"
+ ".inst 0x6e999657 // udot v23.4s, v18.16b, v25.16b\n"
+ "mov v16.16b, v21.16b\n .inst 0x6e9995f0 // udot v16.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e8995f5 // udot v21.4s, v15.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e86965a // udot v26.4s, v18.16b, v6.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "mls v23.4s, v16.4s, v24.4s\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "mls v26.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v17.4s\n"
+ "and v18.16b, v3.16b, v19.16b\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v30.4s, v30.4s, v20.4s\n"
- "sqrdmulh v29.4s, v29.4s, v20.4s\n"
- "sqrdmulh v28.4s, v28.4s, v20.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "and v18.16b, v30.16b, v26.16b\n"
- "and v17.16b, v29.16b, v26.16b\n"
- "and v16.16b, v28.16b, v26.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v17.16b, v23.16b, v19.16b\n"
+ "and v16.16b, v26.16b, v19.16b\n"
+ "sqadd v3.4s, v3.4s, v18.4s\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqadd v30.4s, v30.4s, v18.4s\n"
- "sqadd v29.4s, v29.4s, v17.4s\n"
- "sqadd v28.4s, v28.4s, v16.4s\n"
- "srshl v31.4s, v31.4s, v26.4s\n"
- "srshl v30.4s, v30.4s, v26.4s\n"
- "srshl v29.4s, v29.4s, v26.4s\n"
- "srshl v28.4s, v28.4s, v26.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "sqadd v26.4s, v26.4s, v16.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v12.4s\n"
+ "smax v31.4s, v31.4s, v7.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v7.4s\n"
"smin v31.4s, v31.4s, v11.4s\n"
- "smin v30.4s, v30.4s, v11.4s\n"
- "smin v29.4s, v29.4s, v11.4s\n"
- "smin v28.4s, v28.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "smin v3.4s, v3.4s, v11.4s\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smax v26.4s, v26.4s, v7.4s\n"
+ "smax v23.4s, v23.4s, v7.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "uzp1 v30.16b, v30.16b, v30.16b\n"
- "uzp1 v29.16b, v29.16b, v29.16b\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smin v23.4s, v23.4s, v11.4s\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
"32:" // Oddments: Unroll 3: Oddment store
- "add x25, x25, x27\n"
- "add x24, x24, x27\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
+ "add x11, x11, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
"tbz x20, #1, 33f\n"
- "st1 { v31.h }[0], [x25], #0x2\n"
- "st1 { v30.h }[0], [x24], #0x2\n"
- "st1 { v29.h }[0], [x23], #0x2\n"
- "st1 { v28.h }[0], [x22], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "st1 { v26.h }[0], [x10], #0x2\n"
+ "st1 { v3.h }[0], [x9], #0x2\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
"tbz x20, #0, 34f\n"
- "st1 { v31.b }[2], [x25], #0x1\n"
- "st1 { v30.b }[2], [x24], #0x1\n"
- "st1 { v29.b }[2], [x23], #0x1\n"
- "st1 { v28.b }[2], [x22], #0x1\n"
+ "st1 { v31.b }[2], [x11], #0x1\n"
+ "st1 { v26.b }[2], [x10], #0x1\n"
+ "st1 { v3.b }[2], [x9], #0x1\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
"b 34f\n"
"33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
- "st1 { v31.b }[0], [x25], #0x1\n"
- "st1 { v30.b }[0], [x24], #0x1\n"
- "st1 { v29.b }[0], [x23], #0x1\n"
- "st1 { v28.b }[0], [x22], #0x1\n"
+ "st1 { v31.b }[0], [x11], #0x1\n"
+ "st1 { v26.b }[0], [x10], #0x1\n"
+ "st1 { v3.b }[0], [x9], #0x1\n"
+ "st1 { v23.b }[0], [x28], #0x1\n"
"34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
"35:" // End
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index d5b55cb9c5..56a81849ee 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,1072 +91,1072 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v14.16b }, [x20]\n"
- "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v19.16b }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
- "add x21, x23, %[offsetof_Requantize32_minval]\n"
- "add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v29.8h }, [x21]\n"
- "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "usubl v23.8h, v23.8b, v19.8b\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "usubl v16.8h, v16.8b, v19.8b\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
- "usubl v26.8h, v26.8b, v19.8b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "usubl v18.8h, v18.8b, v19.8b\n"
- "usubl v31.8h, v31.8b, v19.8b\n"
- "ldr d20, [x14, #0x40]\n"
+ "lsr x11, x8, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v11.16b }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v16.16b }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v15.8h, v15.8b, v16.8b\n"
+ "usubl v4.8h, v4.8b, v16.8b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v25.8h, v25.8b, v19.8b\n"
- "usubl v20.8h, v20.8b, v19.8b\n"
- "ldr q9, [x20, #0x0]\n"
- "ldr q24, [x20, #0x10]\n"
+ "usubl v5.8h, v5.8b, v16.8b\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "usubl v25.8h, v25.8b, v16.8b\n"
+ "usubl v10.8h, v10.8b, v16.8b\n"
+ "usubl v6.8h, v6.8b, v16.8b\n"
+ "usubl v7.8h, v7.8b, v16.8b\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
"add x20, x20, #0x20\n"
+ "usubl v9.8h, v9.8b, v16.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x23, x22, [x15, #0x0]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
"ldp x21, x20, [x15, #0x10]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d22, [x23, x17]\n"
- "ldr d4, [x22, x17]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d8, [x21, x17]\n"
- "ldr d27, [x20, x17]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d19, [x23, x17]\n"
+ "ldr d21, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ldr d22, [x20, x17]\n"
"ldr x20, [x15, #0x20]\n"
- "ldr d15, [x20, x17]\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "ldr d20, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q3, [x13, #0x0]\n"
- "ldr q17, [x12, #0x0]\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
- "ldr q21, [x13, #0x10]\n"
- "ldr q28, [x12, #0x10]\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "ldr x20, [x15, #0x28]\n"
- "ldr d11, [x20, x17]\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ldr x20, [x15, #0x38]\n"
- "ldr d4, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "ldr x20, [x15, #0x40]\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr x27, [x15, #0x48]\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "ldr x26, [x15, #0x50]\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "ldr d8, [x20, x17]\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr q17, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "ldr q28, [x13, #0x10]\n"
+ "ldr q23, [x12, #0x10]\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "ldr x24, [x15, #0x28]\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "ldr x23, [x15, #0x38]\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x21, [x15, #0x40]\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
+ "ldr x26, [x15, #0x48]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr d21, [x24, x17]\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "ldr d29, [x21, x17]\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
"ldr x25, [x15, #0x58]\n"
"ldr x24, [x15, #0x60]\n"
- "smlal v2.4s, v11.4h, v31.4h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"ldr x23, [x15, #0x68]\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
"ldr x22, [x15, #0x70]\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
- "smlal v9.4s, v4.4h, v16.4h\n"
"ldr x21, [x15, #0x78]\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "ldr d22, [x26, x17]\n"
+ "smlal v0.4s, v21.4h, v6.4h\n"
+ "smlal2 v24.4s, v21.8h, v6.8h\n"
+ "ldr d21, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "ldr d27, [x27, x17]\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "ldr d11, [x26, x17]\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
"add x14, x14, #0x48\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal v10.4s, v22.4h, v20.4h\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
- "subs x8, x8, #0x1\n"
- "smlal2 v24.4s, v4.8h, v16.8h\n"
- "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "subs x11, x11, #0x1\n"
+ "smlal v31.4s, v19.4h, v9.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"add x13, x13, #0x20\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "ldr d20, [x25, x17]\n"
"add x12, x12, #0x20\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "ldr d15, [x25, x17]\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v20.8h\n"
- "ldr d22, [x24, x17]\n"
- "smlal v7.4s, v4.4h, v23.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "smlal v2.4s, v27.4h, v18.4h\n"
- "smlal v10.4s, v27.4h, v26.4h\n"
- "smlal2 v24.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v20.4h\n"
- "smlal2 v0.4s, v4.8h, v23.8h\n"
- "ldr d4, [x23, x17]\n"
- "smlal2 v30.4s, v27.8h, v18.8h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v26.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "usubl v26.8h, v26.8b, v14.8b\n"
- "smlal v2.4s, v11.4h, v23.4h\n"
- "smlal v10.4s, v15.4h, v1.4h\n"
- "smlal2 v24.4s, v27.8h, v20.8h\n"
- "smlal v9.4s, v11.4h, v5.4h\n"
- "smlal2 v0.4s, v8.8h, v16.8h\n"
- "ldr d8, [x21, x17]\n"
- "smlal2 v30.4s, v11.8h, v23.8h\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v15.8h, v1.8h\n"
- "smlal v7.4s, v27.4h, v25.4h\n"
+ "smlal v2.4s, v18.4h, v4.4h\n"
+ "smlal2 v1.4s, v18.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v9.8h\n"
+ "ldr d19, [x24, x17]\n"
+ "smlal v8.4s, v18.4h, v15.4h\n"
+ "smlal v31.4s, v22.4h, v25.4h\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
+ "smlal2 v30.4s, v18.8h, v15.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v0.4s, v22.4h, v10.4h\n"
+ "smlal2 v24.4s, v22.8h, v10.8h\n"
+ "smlal v2.4s, v29.4h, v5.4h\n"
+ "smlal2 v1.4s, v29.8h, v5.8h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal2 v27.4s, v22.8h, v25.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v8.4s, v29.4h, v4.4h\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal v31.4s, v20.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d4, [x21, x17]\n"
"add x17, x17, #0x8\n"
- "smlal v2.4s, v22.4h, v5.4h\n"
- "smlal v10.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "smlal v9.4s, v22.4h, v31.4h\n"
- "sqrdmulh v9.4s, v9.4s, v3.4s\n"
- "smlal2 v0.4s, v27.8h, v25.8h\n"
- "smlal2 v30.4s, v22.8h, v5.8h\n"
- "and v27.16b, v9.16b, v17.16b\n"
- "smlal2 v6.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v15.4h, v18.4h\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smlal v2.4s, v26.4h, v25.4h\n"
- "smlal v10.4s, v26.4h, v31.4h\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "smlal2 v24.4s, v22.8h, v31.8h\n"
- "smlal2 v0.4s, v15.8h, v18.8h\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- "smlal2 v30.4s, v26.8h, v25.8h\n"
- "smlal2 v6.4s, v26.8h, v31.8h\n"
- "and v31.16b, v24.16b, v28.16b\n"
- "smlal v7.4s, v4.4h, v20.4h\n"
- "smlal v2.4s, v8.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v3.4s\n"
- "smlal v10.4s, v8.4h, v25.4h\n"
- "smlal2 v0.4s, v4.8h, v20.8h\n"
- "sqrdmulh v2.4s, v2.4s, v3.4s\n"
- "smlal2 v30.4s, v8.8h, v20.8h\n"
- "smlal2 v6.4s, v8.8h, v25.8h\n"
- "sqrdmulh v10.4s, v10.4s, v3.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v22.16b, v7.16b, v17.16b\n"
- "sqrdmulh v0.4s, v0.4s, v21.4s\n"
- "and v3.16b, v2.16b, v17.16b\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "and v11.16b, v10.16b, v17.16b\n"
- "sqrdmulh v6.4s, v6.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v20.16b, v0.16b, v28.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v31.16b, v30.16b, v28.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v18.16b, v6.16b, v28.16b\n"
- "sqadd v7.4s, v7.4s, v22.4s\n"
+ "smlal v0.4s, v21.4h, v15.4h\n"
+ "smlal2 v24.4s, v21.8h, v15.8h\n"
+ "smlal v2.4s, v22.4h, v9.4h\n"
+ "smlal2 v1.4s, v22.8h, v9.8h\n"
+ "usubl v25.8h, v25.8b, v11.8b\n"
+ "smlal2 v27.4s, v20.8h, v5.8h\n"
+ "smlal v8.4s, v22.4h, v7.4h\n"
+ "usubl v4.8h, v4.8b, v11.8b\n"
+ "smlal v31.4s, v18.4h, v10.4h\n"
+ "smlal2 v30.4s, v22.8h, v7.8h\n"
+ "smlal v0.4s, v19.4h, v3.4h\n"
+ "smlal2 v24.4s, v19.8h, v3.8h\n"
+ "smlal v2.4s, v21.4h, v3.4h\n"
+ "smlal2 v1.4s, v21.8h, v3.8h\n"
+ "smlal2 v27.4s, v18.8h, v10.8h\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal v31.4s, v25.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v10.8h\n"
+ "smlal v0.4s, v25.4h, v7.4h\n"
+ "smlal2 v24.4s, v25.8h, v7.8h\n"
+ "smlal v2.4s, v19.4h, v6.4h\n"
+ "smlal2 v1.4s, v19.8h, v6.8h\n"
+ "smlal2 v27.4s, v25.8h, v6.8h\n"
+ "smlal v8.4s, v18.4h, v9.4h\n"
+ "smlal v31.4s, v4.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v9.8h\n"
+ "smlal v0.4s, v4.4h, v9.4h\n"
+ "smlal2 v24.4s, v4.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v28.4s\n"
+ "smlal2 v27.4s, v4.8h, v7.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "and v18.16b, v2.16b, v26.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v28.4s\n"
+ "and v4.16b, v1.16b, v23.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v21.16b, v8.16b, v26.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v26.16b\n"
+ "sqadd v2.4s, v2.4s, v18.4s\n"
+ "and v19.16b, v31.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v30.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v4.4s\n"
"sshr v20.4s, v20.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v3.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v11.4s\n"
+ "and v17.16b, v24.16b, v23.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v3.16b, v27.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
"sqadd v0.4s, v0.4s, v20.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "sqadd v30.4s, v30.4s, v31.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v28.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v28.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "srshl v0.4s, v0.4s, v26.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v27.4s, v27.4s, v3.4s\n"
+ "srshl v1.4s, v1.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v28.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d9, [x11, x16]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v23.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v23.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v23.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str d7, [x10, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d2, [x9, x16]\n"
- "str d10, [x28, x16]\n"
- "ldr q9, [x20, #0x0]\n"
- "ldr q24, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str d2, [x10, x16]\n"
+ "str d8, [x9, x16]\n"
+ "str d0, [x28, x16]\n"
+ "str d31, [x27, x16]\n"
"add x16, x16, #0x8\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
- "ldr d20, [x14, #0x40]\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldp x23, x22, [x15, #0x0]\n"
- "usubl v23.8h, v23.8b, v19.8b\n"
- "usubl v16.8h, v16.8b, v19.8b\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "usubl v15.8h, v15.8b, v16.8b\n"
+ "usubl v4.8h, v4.8b, v16.8b\n"
+ "usubl v5.8h, v5.8b, v16.8b\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
"ldp x21, x20, [x15, #0x10]\n"
- "ldr d22, [x23, x17]\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
- "ldr d4, [x22, x17]\n"
- "ldr d8, [x21, x17]\n"
- "usubl v26.8h, v26.8b, v19.8b\n"
- "usubl v18.8h, v18.8b, v19.8b\n"
- "ldr d27, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v16.8b\n"
+ "usubl v10.8h, v10.8b, v16.8b\n"
+ "usubl v6.8h, v6.8b, v16.8b\n"
+ "usubl v7.8h, v7.8b, v16.8b\n"
+ "ldr d19, [x23, x17]\n"
+ "ldr d21, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ldr d22, [x20, x17]\n"
+ "usubl v9.8h, v9.8b, v16.8b\n"
"ldr x20, [x15, #0x20]\n"
- "usubl v31.8h, v31.8b, v19.8b\n"
- "usubl v25.8h, v25.8b, v19.8b\n"
- "ldr d15, [x20, x17]\n"
- "usubl v20.8h, v20.8b, v19.8b\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "ldr d20, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q28, [x13, #0x0]\n"
- "ldr q17, [x12, #0x0]\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
- "ldr q21, [x13, #0x10]\n"
- "ldr q3, [x12, #0x10]\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "ldr x20, [x15, #0x28]\n"
- "ldr d11, [x20, x17]\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ldr x20, [x15, #0x38]\n"
- "ldr d4, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "ldr q17, [x13, #0x10]\n"
+ "ldr q23, [x12, #0x10]\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "ldr x23, [x15, #0x28]\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "ldr x22, [x15, #0x38]\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "ldr x21, [x15, #0x30]\n"
"ldr x20, [x15, #0x40]\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
"ldr x26, [x15, #0x48]\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
"ldr x25, [x15, #0x50]\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "ldr d8, [x20, x17]\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr d21, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
"ldr x24, [x15, #0x58]\n"
"ldr x23, [x15, #0x60]\n"
- "smlal v2.4s, v11.4h, v31.4h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"ldr x22, [x15, #0x68]\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
"ldr x21, [x15, #0x70]\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
- "smlal v9.4s, v4.4h, v16.4h\n"
"ldr x20, [x15, #0x78]\n"
- "tst x7, #0x7\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "ldr d27, [x26, x17]\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "ldr d11, [x25, x17]\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "ldr d22, [x26, x17]\n"
+ "smlal v0.4s, v21.4h, v6.4h\n"
+ "smlal2 v24.4s, v21.8h, v6.8h\n"
+ "ldr d21, [x25, x17]\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
+ "tst x8, #0x7\n"
"add x13, x13, #0x20\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal v10.4s, v22.4h, v20.4h\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
"add x12, x12, #0x20\n"
- "smlal2 v24.4s, v4.8h, v16.8h\n"
- "smlal v9.4s, v8.4h, v1.4h\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "ldr d15, [x24, x17]\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v20.8h\n"
- "ldr d22, [x23, x17]\n"
- "smlal v7.4s, v4.4h, v23.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "smlal v2.4s, v27.4h, v18.4h\n"
- "smlal v10.4s, v27.4h, v26.4h\n"
- "smlal2 v24.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v20.4h\n"
- "smlal2 v0.4s, v4.8h, v23.8h\n"
- "ldr d4, [x22, x17]\n"
- "smlal2 v30.4s, v27.8h, v18.8h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v26.8h\n"
- "ldr d26, [x21, x17]\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "usubl v26.8h, v26.8b, v14.8b\n"
- "smlal v2.4s, v11.4h, v23.4h\n"
- "smlal v10.4s, v15.4h, v1.4h\n"
- "smlal2 v24.4s, v27.8h, v20.8h\n"
- "smlal v9.4s, v11.4h, v5.4h\n"
- "smlal2 v0.4s, v8.8h, v16.8h\n"
- "ldr d16, [x20, x17]\n"
- "smlal2 v30.4s, v11.8h, v23.8h\n"
- "usubl v16.8h, v16.8b, v14.8b\n"
- "smlal2 v6.4s, v15.8h, v1.8h\n"
- "smlal v7.4s, v27.4h, v25.4h\n"
+ "smlal v31.4s, v18.4h, v9.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "ldr d20, [x24, x17]\n"
+ "smlal v2.4s, v19.4h, v4.4h\n"
+ "smlal2 v1.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v18.8h, v9.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v8.4s, v19.4h, v15.4h\n"
+ "smlal v31.4s, v22.4h, v25.4h\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
+ "smlal2 v30.4s, v19.8h, v15.8h\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v0.4s, v22.4h, v10.4h\n"
+ "smlal2 v24.4s, v22.8h, v10.8h\n"
+ "smlal v2.4s, v29.4h, v5.4h\n"
+ "smlal2 v1.4s, v29.8h, v5.8h\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal2 v27.4s, v22.8h, v25.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal v8.4s, v29.4h, v4.4h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal v31.4s, v20.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"add x17, x17, #0x8\n"
- "smlal v2.4s, v22.4h, v5.4h\n"
- "smlal v10.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "smlal v9.4s, v22.4h, v31.4h\n"
- "sqrdmulh v9.4s, v9.4s, v28.4s\n"
- "smlal2 v0.4s, v27.8h, v25.8h\n"
- "smlal2 v30.4s, v22.8h, v5.8h\n"
- "and v1.16b, v9.16b, v17.16b\n"
- "smlal2 v6.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v15.4h, v18.4h\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smlal v2.4s, v26.4h, v25.4h\n"
- "smlal v10.4s, v26.4h, v31.4h\n"
- "sqadd v9.4s, v9.4s, v1.4s\n"
- "smlal2 v24.4s, v22.8h, v31.8h\n"
- "smlal2 v0.4s, v15.8h, v18.8h\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- "smlal2 v30.4s, v26.8h, v25.8h\n"
- "smlal2 v6.4s, v26.8h, v31.8h\n"
- "and v31.16b, v24.16b, v3.16b\n"
- "smlal v7.4s, v4.4h, v20.4h\n"
- "smlal v2.4s, v16.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v28.4s\n"
- "smlal v10.4s, v16.4h, v25.4h\n"
- "smlal2 v0.4s, v4.8h, v20.8h\n"
- "sqrdmulh v2.4s, v2.4s, v28.4s\n"
- "smlal2 v30.4s, v16.8h, v20.8h\n"
- "smlal2 v6.4s, v16.8h, v25.8h\n"
- "sqrdmulh v10.4s, v10.4s, v28.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v22.16b, v7.16b, v17.16b\n"
- "sqrdmulh v0.4s, v0.4s, v21.4s\n"
- "and v15.16b, v2.16b, v17.16b\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "and v11.16b, v10.16b, v17.16b\n"
- "sqrdmulh v6.4s, v6.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
+ "smlal v0.4s, v21.4h, v15.4h\n"
+ "smlal2 v24.4s, v21.8h, v15.8h\n"
+ "smlal v2.4s, v22.4h, v9.4h\n"
+ "smlal2 v1.4s, v22.8h, v9.8h\n"
+ "usubl v25.8h, v25.8b, v11.8b\n"
+ "smlal2 v27.4s, v20.8h, v5.8h\n"
+ "smlal v8.4s, v22.4h, v7.4h\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v31.4s, v19.4h, v10.4h\n"
+ "smlal2 v30.4s, v22.8h, v7.8h\n"
+ "smlal v0.4s, v18.4h, v3.4h\n"
+ "smlal2 v24.4s, v18.8h, v3.8h\n"
+ "smlal v2.4s, v21.4h, v3.4h\n"
+ "smlal2 v1.4s, v21.8h, v3.8h\n"
+ "smlal2 v27.4s, v19.8h, v10.8h\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal v31.4s, v25.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v10.8h\n"
+ "smlal v0.4s, v25.4h, v7.4h\n"
+ "smlal2 v24.4s, v25.8h, v7.8h\n"
+ "smlal v2.4s, v18.4h, v6.4h\n"
+ "smlal2 v1.4s, v18.8h, v6.8h\n"
+ "smlal2 v27.4s, v25.8h, v6.8h\n"
+ "smlal v8.4s, v19.4h, v9.4h\n"
+ "smlal v31.4s, v29.4h, v7.4h\n"
+ "smlal2 v30.4s, v19.8h, v9.8h\n"
+ "smlal v0.4s, v29.4h, v9.4h\n"
+ "smlal2 v24.4s, v29.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v26.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v17.4s\n"
+ "smlal2 v27.4s, v29.8h, v7.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v26.4s\n"
+ "and v25.16b, v2.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v26.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "and v22.16b, v1.16b, v23.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v21.16b, v8.16b, v28.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v17.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v17.4s\n"
"sshr v22.4s, v22.4s, #0x1f\n"
- "and v18.16b, v0.16b, v3.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "and v23.16b, v30.16b, v3.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v21.16b, v6.16b, v3.16b\n"
- "sqadd v7.4s, v7.4s, v22.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v15.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v11.4s\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sqadd v2.4s, v2.4s, v25.4s\n"
+ "and v19.16b, v31.16b, v28.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "sqadd v30.4s, v30.4s, v23.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "sqadd v6.4s, v6.4s, v21.4s\n"
- "srshl v24.4s, v24.4s, v3.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
+ "and v10.16b, v30.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v17.16b, v24.16b, v23.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v22.16b, v27.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v28.4s\n"
+ "srshl v8.4s, v8.4s, v28.4s\n"
+ "sqadd v30.4s, v30.4s, v10.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v3.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d9, [x11, x16]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v23.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v23.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v23.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str d7, [x10, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d2, [x9, x16]\n"
- "str d10, [x28, x16]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str d2, [x10, x16]\n"
+ "str d8, [x9, x16]\n"
+ "str d0, [x28, x16]\n"
+ "str d31, [x27, x16]\n"
"add x16, x16, #0x8\n"
"beq 64f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v9.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v24.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v2.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v24.s }[0], [x20]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v9.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v9.s }[2], [x20]\n"
+ "tbz x8, #1, 6f\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v2.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v9.s }[0], [x20]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v2.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "usubl v23.8h, v23.8b, v19.8b\n"
- "usubl v16.8h, v16.8b, v19.8b\n"
- "ldr d20, [x14, #0x40]\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v15.8h, v15.8b, v16.8b\n"
+ "usubl v4.8h, v4.8b, v16.8b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldp x24, x23, [x15, #0x0]\n"
- "usubl v1.8h, v1.8b, v19.8b\n"
- "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v16.8b\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "usubl v25.8h, v25.8b, v16.8b\n"
+ "usubl v10.8h, v10.8b, v16.8b\n"
+ "usubl v6.8h, v6.8b, v16.8b\n"
+ "usubl v7.8h, v7.8b, v16.8b\n"
"ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
- "usubl v26.8h, v26.8b, v19.8b\n"
- "usubl v18.8h, v18.8b, v19.8b\n"
- "usubl v31.8h, v31.8b, v19.8b\n"
- "usubl v25.8h, v25.8b, v19.8b\n"
- "usubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v9.8h, v9.8b, v16.8b\n"
"add x24, x24, x17\n"
"add x23, x23, x17\n"
+ "ldr x20, [x15, #0x20]\n"
"add x22, x22, x17\n"
"add x21, x21, x17\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v22.s }[0], [x24], #0x4\n"
- "ld1 { v4.s }[0], [x23], #0x4\n"
- "ld1 { v8.s }[0], [x22], #0x4\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v22.h }[2], [x24], #0x2\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v8.h }[2], [x22], #0x2\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[6], [x24]\n"
- "ld1 { v4.b }[6], [x23]\n"
- "ld1 { v8.b }[6], [x22]\n"
- "ld1 { v27.b }[6], [x21]\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v19.s }[0], [x24], #0x4\n"
+ "ld1 { v21.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v22.s }[0], [x21], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v21.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v21.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[4], [x24]\n"
- "ld1 { v4.b }[4], [x23]\n"
- "ld1 { v8.b }[4], [x22]\n"
- "ld1 { v27.b }[4], [x21]\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v21.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v22.h }[0], [x24], #0x2\n"
- "ld1 { v4.h }[0], [x23], #0x2\n"
- "ld1 { v8.h }[0], [x22], #0x2\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v8.b }[2], [x22]\n"
- "ld1 { v27.b }[2], [x21]\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x8, #1, 10f\n"
+ "ld1 { v19.h }[0], [x24], #0x2\n"
+ "ld1 { v21.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v22.h }[0], [x21], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v21.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[0], [x24]\n"
- "ld1 { v4.b }[0], [x23]\n"
- "ld1 { v8.b }[0], [x22]\n"
- "ld1 { v27.b }[0], [x21]\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[0], [x24]\n"
+ "ld1 { v21.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v22.8h, v22.8b, v14.8b\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"ldr x20, [x15, #0x28]\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
"add x20, x20, x17\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x8, #1, 14f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v21.8h, v21.8b, v14.8b\n"
- "smlal v2.4s, v21.4h, v31.4h\n"
- "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x30]\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v0.4s, v17.4h, v6.4h\n"
+ "smlal2 v24.4s, v17.8h, v6.8h\n"
"add x20, x20, x17\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x8, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x38]\n"
- "smlal v10.4s, v28.4h, v20.4h\n"
- "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "smlal v31.4s, v16.4h, v9.4h\n"
+ "smlal2 v27.4s, v16.8h, v9.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x8, #1, 22f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x40]\n"
- "smlal v9.4s, v22.4h, v16.4h\n"
- "smlal2 v24.4s, v22.8h, v16.8h\n"
- "smlal v7.4s, v22.4h, v23.4h\n"
- "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "smlal v2.4s, v17.4h, v4.4h\n"
+ "smlal2 v1.4s, v17.8h, v4.8h\n"
+ "smlal v8.4s, v17.4h, v15.4h\n"
+ "smlal2 v30.4s, v17.8h, v15.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x8, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v21.8h, v21.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x48]\n"
- "smlal v9.4s, v21.4h, v1.4h\n"
- "smlal2 v24.4s, v21.8h, v1.8h\n"
- "smlal v7.4s, v21.4h, v16.4h\n"
- "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "smlal v2.4s, v16.4h, v5.4h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal2 v30.4s, v16.8h, v4.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x8, #1, 30f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x50]\n"
- "smlal v9.4s, v28.4h, v20.4h\n"
- "smlal2 v24.4s, v28.8h, v20.8h\n"
- "smlal v7.4s, v28.4h, v25.4h\n"
- "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "smlal v2.4s, v17.4h, v9.4h\n"
+ "smlal2 v1.4s, v17.8h, v9.8h\n"
+ "smlal v8.4s, v17.4h, v7.4h\n"
+ "smlal2 v30.4s, v17.8h, v7.8h\n"
+ "smlal v0.4s, v17.4h, v10.4h\n"
+ "smlal2 v24.4s, v17.8h, v10.8h\n"
+ "smlal v31.4s, v17.4h, v25.4h\n"
"add x20, x20, x17\n"
- "smlal v2.4s, v28.4h, v18.4h\n"
- "smlal2 v30.4s, v28.8h, v18.8h\n"
- "smlal v10.4s, v28.4h, v26.4h\n"
- "smlal2 v6.4s, v28.8h, v26.8h\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "smlal2 v27.4s, v17.8h, v25.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x8, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v9.4s, v8.4h, v5.4h\n"
- "smlal2 v24.4s, v8.8h, v5.8h\n"
- "smlal v2.4s, v8.4h, v23.4h\n"
- "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "smlal v2.4s, v16.4h, v3.4h\n"
+ "smlal2 v1.4s, v16.8h, v3.8h\n"
+ "smlal v0.4s, v16.4h, v15.4h\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x8, #1, 38f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v7.4s, v8.4h, v18.4h\n"
- "smlal2 v0.4s, v8.8h, v18.8h\n"
- "smlal v10.4s, v8.4h, v1.4h\n"
- "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "smlal v8.4s, v17.4h, v10.4h\n"
+ "smlal2 v30.4s, v17.8h, v10.8h\n"
+ "smlal v31.4s, v17.4h, v5.4h\n"
+ "smlal2 v27.4s, v17.8h, v5.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
- "ld1 { v17.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
- "ld1 { v17.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[6], [x20]\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[4], [x20]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 42f\n"
- "ld1 { v17.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[2], [x20]\n"
+ "tbz x8, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[0], [x20]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v17.8h, v17.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v9.4s, v17.4h, v31.4h\n"
- "smlal2 v24.4s, v17.8h, v31.8h\n"
- "smlal v2.4s, v17.4h, v5.4h\n"
- "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "smlal v2.4s, v16.4h, v6.4h\n"
+ "smlal2 v1.4s, v16.8h, v6.8h\n"
+ "smlal v0.4s, v16.4h, v3.4h\n"
+ "smlal2 v24.4s, v16.8h, v3.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x8, #1, 46f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v23.8h, v23.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x70]\n"
- "smlal v7.4s, v23.4h, v20.4h\n"
- "smlal2 v0.4s, v23.8h, v20.8h\n"
- "smlal v10.4s, v23.4h, v18.4h\n"
- "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v30.4s, v17.8h, v9.8h\n"
+ "smlal v31.4s, v17.4h, v10.4h\n"
+ "smlal2 v27.4s, v17.8h, v10.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
- "ld1 { v5.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[6], [x20]\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[4], [x20]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 50f\n"
- "ld1 { v5.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "tbz x8, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[0], [x20]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v5.8h, v5.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x78]\n"
- "smlal v2.4s, v5.4h, v25.4h\n"
- "smlal2 v30.4s, v5.8h, v25.8h\n"
- "smlal v10.4s, v5.4h, v31.4h\n"
- "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "smlal v0.4s, v16.4h, v7.4h\n"
+ "smlal2 v24.4s, v16.8h, v7.8h\n"
+ "smlal v31.4s, v16.4h, v6.4h\n"
+ "smlal2 v27.4s, v16.8h, v6.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x8, #1, 54f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v23.8h, v23.8b, v14.8b\n"
- "smlal v2.4s, v23.4h, v20.4h\n"
- "smlal2 v30.4s, v23.8h, v20.8h\n"
- "smlal v10.4s, v23.4h, v25.4h\n"
- "smlal2 v6.4s, v23.8h, v25.8h\n"
- "tbz x7, #2, 57f\n"
- "ld1 { v15.4s }, [x13], #0x10\n"
- "ld1 { v19.4s }, [x12], #0x10\n"
- "tbz x7, #1, 56f\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v0.4s, v17.4h, v9.4h\n"
+ "smlal2 v24.4s, v17.8h, v9.8h\n"
+ "smlal v31.4s, v17.4h, v7.4h\n"
+ "smlal2 v27.4s, v17.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v16.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
+ "tbz x8, #1, 56f\n"
"ld1 { v18.d }[0], [x13], #0x8\n"
"ld1 { v22.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v18.s }[2], [x13]\n"
"ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v18.s }[0], [x13]\n"
"ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 58f\n"
- "ld1 { v15.d }[0], [x13], #0x8\n"
- "ld1 { v19.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v15.s }[2], [x13]\n"
- "ld1 { v19.s }[2], [x12]\n"
+ "tbz x8, #1, 58f\n"
+ "ld1 { v16.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v15.s }[0], [x13]\n"
- "ld1 { v19.s }[0], [x12]\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v9.4s, v9.4s, v15.4s\n"
- "and v17.16b, v9.16b, v19.16b\n"
- "add x11, x11, x16\n"
+ "sqrdmulh v2.4s, v2.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
"add x10, x10, x16\n"
- "sqrdmulh v24.4s, v24.4s, v18.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
"add x9, x9, x16\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v16.4s\n"
"add x28, x28, x16\n"
- "and v20.16b, v24.16b, v22.16b\n"
- "sqrdmulh v7.4s, v7.4s, v15.4s\n"
- "sqrdmulh v2.4s, v2.4s, v15.4s\n"
- "sqrdmulh v10.4s, v10.4s, v15.4s\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v21.16b, v7.16b, v19.16b\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "and v15.16b, v2.16b, v19.16b\n"
+ "add x27, x27, x16\n"
+ "sqrdmulh v31.4s, v31.4s, v16.4s\n"
"sqrdmulh v30.4s, v30.4s, v18.4s\n"
- "and v23.16b, v10.16b, v19.16b\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
+ "and v17.16b, v2.16b, v23.16b\n"
+ "and v16.16b, v1.16b, v22.16b\n"
+ "and v21.16b, v8.16b, v23.16b\n"
+ "and v20.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v18.16b, v0.16b, v22.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "and v17.16b, v30.16b, v22.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v28.16b, v6.16b, v22.16b\n"
- "sqadd v7.4s, v7.4s, v21.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v15.4s\n"
+ "and v19.16b, v30.16b, v22.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v18.16b, v24.16b, v22.16b\n"
+ "sqadd v2.4s, v2.4s, v17.4s\n"
+ "and v17.16b, v31.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v22.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v23.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v19.4s\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v19.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v6.4s, v6.4s, v28.4s\n"
- "srshl v24.4s, v24.4s, v22.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v22.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v23.4s\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v22.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v22.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "tbz x7, #2, 61f\n"
- "st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v7.s }[0], [x10], #0x4\n"
- "st1 { v2.s }[0], [x9], #0x4\n"
- "st1 { v10.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 60f\n"
- "st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v7.h }[2], [x10], #0x2\n"
- "st1 { v2.h }[2], [x9], #0x2\n"
- "st1 { v10.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v7.b }[6], [x10], #0x1\n"
- "st1 { v2.b }[6], [x9], #0x1\n"
- "st1 { v10.b }[6], [x28], #0x1\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v2.s }[0], [x10], #0x4\n"
+ "st1 { v8.s }[0], [x9], #0x4\n"
+ "st1 { v0.s }[0], [x28], #0x4\n"
+ "st1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v2.h }[2], [x10], #0x2\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v0.h }[2], [x28], #0x2\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[6], [x10], #0x1\n"
+ "st1 { v8.b }[6], [x9], #0x1\n"
+ "st1 { v0.b }[6], [x28], #0x1\n"
+ "st1 { v31.b }[6], [x27], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v7.b }[4], [x10], #0x1\n"
- "st1 { v2.b }[4], [x9], #0x1\n"
- "st1 { v10.b }[4], [x28], #0x1\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[4], [x10], #0x1\n"
+ "st1 { v8.b }[4], [x9], #0x1\n"
+ "st1 { v0.b }[4], [x28], #0x1\n"
+ "st1 { v31.b }[4], [x27], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v7.h }[0], [x10], #0x2\n"
- "st1 { v2.h }[0], [x9], #0x2\n"
- "st1 { v10.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v7.b }[2], [x10], #0x1\n"
- "st1 { v2.b }[2], [x9], #0x1\n"
- "st1 { v10.b }[2], [x28], #0x1\n"
+ "tbz x8, #1, 62f\n"
+ "st1 { v2.h }[0], [x10], #0x2\n"
+ "st1 { v8.h }[0], [x9], #0x2\n"
+ "st1 { v0.h }[0], [x28], #0x2\n"
+ "st1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[2], [x10], #0x1\n"
+ "st1 { v8.b }[2], [x9], #0x1\n"
+ "st1 { v0.b }[2], [x28], #0x1\n"
+ "st1 { v31.b }[2], [x27], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v7.b }[0], [x10], #0x1\n"
- "st1 { v2.b }[0], [x9], #0x1\n"
- "st1 { v10.b }[0], [x28], #0x1\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[0], [x10], #0x1\n"
+ "st1 { v8.b }[0], [x9], #0x1\n"
+ "st1 { v0.b }[0], [x28], #0x1\n"
+ "st1 { v31.b }[0], [x27], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
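For orientation between the two kernel diffs: the epilogue visible in the hunks above follows the usual Requantize32 pattern — a rounding-doubling multiply by the per-channel requant multiplier (sqrdmulh), a rounding right shift by the requant shift (srshl, with a sign-correction sqadd beforehand), addition of the output offset (c_offset), and a clamp to [minval, maxval] before narrowing back to uint8. The scalar C++ below is only a rough sketch of that sequence under those assumptions; it is not bit-exact with the NEON sign-correction fixups and omits the sqrdmulh saturation corner case, and the helper name and parameters are illustrative, not part of the library.

    // Approximate scalar model of the sqrdmulh/srshl/sqadd/smax/smin epilogue
    // in the kernels above. 'mul' and 'shift' stand in for one lane of
    // requant_muls/requant_shifts; the offsets mirror arm_gemm::Requantize32.
    #include <cstdint>
    #include <algorithm>

    inline uint8_t requantize_lane_sketch(int32_t acc, int32_t mul, int32_t shift,
                                          int32_t c_offset,
                                          int32_t minval, int32_t maxval)
    {
      // Rounding-doubling multiply-high, as sqrdmulh computes per 32-bit lane
      // (saturation of the INT32_MIN * INT32_MIN case is omitted here).
      const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(mul);
      int32_t hi = static_cast<int32_t>((prod + (1LL << 30)) >> 31);

      // Rounding right shift; srshl with a negative shift operand behaves like
      // a rounding shift right by |shift| (the vector code adds a correction
      // term first so negative values round the same way).
      if (shift < 0)
      {
        const int32_t s = -shift;
        hi = static_cast<int32_t>((static_cast<int64_t>(hi) + (1LL << (s - 1))) >> s);
      }

      // Add the output zero point, then clamp to the activation range before
      // the narrowing stores (sqxtn/uzp1 in the assembly).
      int32_t out = hi + c_offset;
      out = std::max(out, minval);
      out = std::min(out, maxval);
      return static_cast<uint8_t>(out);
    }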
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index c4184622b0..4d56009adc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,1294 +100,1294 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v6.16b }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x17, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
+ "ld1r { v14.16b }, [x21]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v17.8h }, [x21]\n"
- "ld1r { v24.8h }, [x20]\n"
- "mov x17, #0x0\n"
- "mov x16, #0x0\n"
- "add x15, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "usubl v11.8h, v11.8b, v15.8b\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "usubl v22.8h, v22.8b, v15.8b\n"
- "usubl v14.8h, v14.8b, v15.8b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "usubl v28.8h, v28.8b, v15.8b\n"
- "usubl v18.8h, v18.8b, v15.8b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v9.8h, v9.8b, v15.8b\n"
- "usubl v26.8h, v26.8b, v15.8b\n"
- "ldr d4, [x14, #0x40]\n"
+ "ld1r { v23.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x17, 3f\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "subs x17, x17, #0x1\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "ldr d7, [x6, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr q19, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
"add x20, x20, #0x20\n"
+ "usubl v7.8h, v7.8b, v14.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d25, [x27, x17]\n"
- "ldr d27, [x26, x17]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d1, [x25, x17]\n"
- "ldr d2, [x24, x17]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "ldr d12, [x23, x17]\n"
- "ldr d16, [x22, x17]\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "ldr d23, [x21, x17]\n"
- "ldr d10, [x20, x17]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "ldr d26, [x27, x3]\n"
+ "ldr d31, [x26, x3]\n"
+ "ldr d20, [x25, x3]\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr d6, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d0, [x21, x3]\n"
+ "ldr d18, [x20, x3]\n"
+ "usubl v26.8h, v26.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q30, [x13, #0x0]\n"
- "ldr q29, [x12, #0x0]\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x21, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "ldr x25, [x15, #0x60]\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "ldr d25, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal v20.4s, v27.4h, v28.4h\n"
- "smlal v19.4s, v25.4h, v18.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ldr d1, [x25, x17]\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "ldr d2, [x24, x17]\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v28.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v18.8h\n"
- "ldr d25, [x22, x17]\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v20.4s, v1.4h, v11.4h\n"
- "smlal v19.4s, v2.4h, v22.4h\n"
- "ldr x24, [x15, #0x50]\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ldr d16, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "ldr d12, [x20, x17]\n"
- "ldr x23, [x15, #0x48]\n"
- "smlal2 v0.4s, v1.8h, v11.8h\n"
- "smlal2 v31.4s, v2.8h, v22.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal v20.4s, v27.4h, v18.4h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v19.4s, v25.4h, v9.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ldr d23, [x25, x17]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "ldr d11, [x24, x17]\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v18.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v9.8h\n"
- "ldr d25, [x21, x17]\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v21.4s, v16.4h, v18.4h\n"
- "smlal v20.4s, v12.4h, v22.4h\n"
- "smlal v19.4s, v23.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "ldr d10, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
- "smlal v5.4s, v11.4h, v9.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal2 v8.4s, v16.8h, v18.8h\n"
- "ldr d18, [x22, x17]\n"
- "ldr d16, [x21, x17]\n"
- "smlal2 v0.4s, v12.8h, v22.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal2 v31.4s, v23.8h, v14.8h\n"
- "ldr q14, [x13, #0x10]\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v25.4h, v26.4h\n"
- "smlal v19.4s, v10.4h, v28.4h\n"
- "usubl v18.8h, v18.8b, v6.8b\n"
- "ldr x21, [x15, #0xc0]\n"
- "smlal2 v3.4s, v11.8h, v9.8h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q30, [x8, #0x0]\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "ldr x24, [x5, #0x58]\n"
+ "ldr x23, [x5, #0x78]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "ldr x22, [x5, #0x60]\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "ldr q26, [x7, #0x10]\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "ldr d31, [x24, x3]\n"
+ "ldr x12, [x5, #0x88]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "ldr x11, [x5, #0x40]\n"
+ "ldr x10, [x5, #0x70]\n"
+ "add x6, x6, #0x48\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x23, x3]\n"
+ "ldr x9, [x5, #0x98]\n"
+ "subs x17, x17, #0x1\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x28, [x5, #0x50]\n"
+ "ldr x27, [x5, #0x48]\n"
+ "add x7, x7, #0x20\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "ldr d20, [x22, x3]\n"
+ "ldr x26, [x5, #0x90]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "ldr x25, [x5, #0xa8]\n"
+ "ldr x24, [x5, #0xa0]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "ldr d31, [x21, x3]\n"
+ "ldr x23, [x5, #0xb0]\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "ldr x22, [x5, #0xb8]\n"
+ "smlal v3.4s, v28.4h, v27.4h\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "ldr x21, [x5, #0xc0]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v8.4s, v27.8h, v9.8h\n"
- "ldr d27, [x21, x17]\n"
- "smlal2 v0.4s, v25.8h, v26.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal2 v31.4s, v10.8h, v28.8h\n"
- "smlal v21.4s, v11.4h, v28.4h\n"
- "usubl v22.8h, v22.8b, v6.8b\n"
- "add x14, x14, #0x48\n"
- "smlal v20.4s, v18.4h, v7.4h\n"
- "smlal v19.4s, v16.4h, v7.4h\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "add x17, x17, #0x8\n"
- "smlal2 v3.4s, v1.8h, v26.8h\n"
- "smlal v5.4s, v12.4h, v7.4h\n"
- "sqrdmulh v5.4s, v5.4s, v30.4s\n"
- "subs x8, x8, #0x1\n"
- "smlal2 v8.4s, v11.8h, v28.8h\n"
- "smlal2 v0.4s, v18.8h, v7.8h\n"
- "and v28.16b, v5.16b, v29.16b\n"
- "add x13, x13, #0x20\n"
- "smlal2 v31.4s, v16.8h, v7.8h\n"
- "smlal v21.4s, v2.4h, v7.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v27.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "ldr d9, [x11, x3]\n"
+ "smlal v10.4s, v20.4h, v16.4h\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal2 v21.4s, v20.8h, v16.8h\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "ldr d16, [x10, x3]\n"
+ "smlal v3.4s, v31.4h, v11.4h\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "smlal2 v24.4s, v31.8h, v11.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "ldr d0, [x9, x3]\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "smlal v10.4s, v6.4h, v27.4h\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "smlal2 v21.4s, v6.8h, v27.8h\n"
+ "ldr d6, [x28, x3]\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal v8.4s, v9.4h, v27.4h\n"
+ "smlal2 v4.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "ldr d27, [x26, x3]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x25, x3]\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "ldr d18, [x24, x3]\n"
+ "smlal v10.4s, v16.4h, v11.4h\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x23, x3]\n"
+ "smlal v3.4s, v0.4h, v29.4h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v6.4h, v2.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d29, [x22, x3]\n"
+ "smlal2 v1.4s, v6.8h, v2.8h\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "usubl v11.8h, v11.8b, v13.8b\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal v10.4s, v27.4h, v22.4h\n"
+ "smlal v3.4s, v28.4h, v15.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal v19.4s, v20.4h, v22.4h\n"
+ "smlal2 v21.4s, v27.8h, v22.8h\n"
+ "ldr q27, [x8, #0x10]\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v15.8h\n"
+ "smlal2 v1.4s, v20.8h, v22.8h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "add x8, x8, #0x20\n"
+ "smlal v8.4s, v6.4h, v15.4h\n"
+ "smlal2 v4.4s, v6.8h, v15.8h\n"
+ "smlal v10.4s, v18.4h, v5.4h\n"
+ "smlal v3.4s, v11.4h, v5.4h\n"
+ "smlal v19.4s, v16.4h, v5.4h\n"
+ "smlal2 v21.4s, v18.8h, v5.8h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal v8.4s, v31.4h, v5.4h\n"
+ "smlal2 v4.4s, v31.8h, v5.8h\n"
+ "smlal v10.4s, v28.4h, v2.4h\n"
+ "smlal v3.4s, v29.4h, v22.4h\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal2 v24.4s, v29.8h, v22.8h\n"
+ "sqrdmulh v1.4s, v1.4s, v26.4s\n"
+ "smlal v8.4s, v0.4h, v7.4h\n"
+ "and v2.16b, v19.16b, v30.16b\n"
+ "smlal2 v4.4s, v0.8h, v7.8h\n"
+ "smlal v10.4s, v29.4h, v7.4h\n"
+ "smlal v3.4s, v9.4h, v7.4h\n"
+ "and v11.16b, v1.16b, v27.16b\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal2 v24.4s, v9.8h, v7.8h\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqrdmulh v4.4s, v4.4s, v26.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "and v29.16b, v8.16b, v30.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "and v20.16b, v10.16b, v30.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "and v28.16b, v3.16b, v30.16b\n"
+ "sqadd v1.4s, v1.4s, v11.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v7.16b, v4.16b, v27.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v2.16b, v21.16b, v27.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "add x12, x12, #0x20\n"
- "smlal v20.4s, v10.4h, v9.4h\n"
- "smlal v19.4s, v22.4h, v26.4h\n"
- "sqadd v5.4s, v5.4s, v28.4s\n"
- "smlal2 v3.4s, v12.8h, v7.8h\n"
- "smlal2 v8.4s, v2.8h, v7.8h\n"
- "sqrdmulh v3.4s, v3.4s, v14.4s\n"
- "smlal2 v0.4s, v10.8h, v9.8h\n"
- "smlal2 v31.4s, v22.8h, v26.8h\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "smlal v21.4s, v23.4h, v4.4h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "sqrdmulh v21.4s, v21.4s, v30.4s\n"
- "smlal v19.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "smlal2 v0.4s, v22.8h, v4.8h\n"
- "smlal2 v31.4s, v27.8h, v4.8h\n"
- "sqrdmulh v19.4s, v19.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v12.16b, v21.16b, v29.16b\n"
- "sqrdmulh v8.4s, v8.4s, v14.4s\n"
- "and v23.16b, v20.16b, v29.16b\n"
- "sqrdmulh v0.4s, v0.4s, v14.4s\n"
- "and v9.16b, v19.16b, v29.16b\n"
- "sqrdmulh v31.4s, v31.4s, v14.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v12.4s, v12.4s, #0x1f\n"
- "and v18.16b, v8.16b, v25.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v22.16b, v0.16b, v25.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v16.16b, v31.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v12.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v23.4s\n"
+ "and v22.16b, v24.16b, v27.16b\n"
+ "sqadd v8.4s, v8.4s, v29.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v3.4s, v3.4s, v28.4s\n"
"sshr v22.4s, v22.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v9.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "srshl v21.4s, v21.4s, v29.4s\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v29.4s\n"
- "sqadd v0.4s, v0.4s, v22.4s\n"
- "srshl v19.4s, v19.4s, v29.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v25.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v30.4s\n"
+ "sqadd v4.4s, v4.4s, v7.4s\n"
+ "srshl v10.4s, v10.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v2.4s\n"
+ "srshl v3.4s, v3.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v27.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str d5, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
+ "srshl v4.4s, v4.4s, v27.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v27.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d20, [x9, x16]\n"
- "str d19, [x28, x16]\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str d19, [x16, x4]\n"
+ "str d8, [x15, x4]\n"
+ "str d10, [x14, x4]\n"
+ "str d3, [x13, x4]\n"
+ "add x4, x4, #0x8\n"
+ "ldr q19, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "add x16, x16, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d4, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "usubl v11.8h, v11.8b, v15.8b\n"
- "usubl v22.8h, v22.8b, v15.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "usubl v14.8h, v14.8b, v15.8b\n"
- "usubl v28.8h, v28.8b, v15.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ldr d25, [x27, x17]\n"
- "usubl v18.8h, v18.8b, v15.8b\n"
- "usubl v9.8h, v9.8b, v15.8b\n"
- "ldr d27, [x26, x17]\n"
- "ldr d1, [x25, x17]\n"
- "usubl v26.8h, v26.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "ldr d2, [x24, x17]\n"
- "ldr d12, [x23, x17]\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "ldr d16, [x22, x17]\n"
- "ldr d23, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "ldr d10, [x20, x17]\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d7, [x6, #0x40]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "usubl v7.8h, v7.8b, v14.8b\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "ldr d26, [x27, x3]\n"
+ "ldr d31, [x26, x3]\n"
+ "ldr d20, [x25, x3]\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr d6, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d0, [x21, x3]\n"
+ "usubl v26.8h, v26.8b, v13.8b\n"
+ "ldr d18, [x20, x3]\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q29, [x13, #0x0]\n"
- "ldr q30, [x12, #0x0]\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x21, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "ldr x25, [x15, #0x60]\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "ldr d25, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal v20.4s, v27.4h, v28.4h\n"
- "smlal v19.4s, v25.4h, v18.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ldr d1, [x25, x17]\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "ldr d2, [x24, x17]\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v28.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v18.8h\n"
- "ldr d25, [x22, x17]\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v20.4s, v1.4h, v11.4h\n"
- "smlal v19.4s, v2.4h, v22.4h\n"
- "ldr x24, [x15, #0x50]\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ldr d16, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "ldr d12, [x20, x17]\n"
- "ldr x23, [x15, #0x48]\n"
- "smlal2 v0.4s, v1.8h, v11.8h\n"
- "smlal2 v31.4s, v2.8h, v22.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal v20.4s, v27.4h, v18.4h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v19.4s, v25.4h, v9.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ldr d23, [x25, x17]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "ldr d11, [x24, x17]\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v18.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v9.8h\n"
- "ldr d25, [x21, x17]\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v21.4s, v16.4h, v18.4h\n"
- "smlal v20.4s, v12.4h, v22.4h\n"
- "smlal v19.4s, v23.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "ldr d10, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
- "smlal v5.4s, v11.4h, v9.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal2 v8.4s, v16.8h, v18.8h\n"
- "ldr d16, [x22, x17]\n"
- "ldr d18, [x21, x17]\n"
- "smlal2 v0.4s, v12.8h, v22.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal2 v31.4s, v23.8h, v14.8h\n"
- "ldr q14, [x13, #0x10]\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v25.4h, v26.4h\n"
- "smlal v19.4s, v10.4h, v28.4h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v3.4s, v11.8h, v9.8h\n"
- "usubl v18.8h, v18.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v26.4h\n"
- "tst x7, #0x7\n"
- "smlal2 v8.4s, v27.8h, v9.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal2 v0.4s, v25.8h, v26.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal2 v31.4s, v10.8h, v28.8h\n"
- "smlal v21.4s, v11.4h, v28.4h\n"
- "usubl v22.8h, v22.8b, v6.8b\n"
- "add x17, x17, #0x8\n"
- "smlal v20.4s, v16.4h, v7.4h\n"
- "smlal v19.4s, v18.4h, v7.4h\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "add x13, x13, #0x20\n"
- "smlal2 v3.4s, v1.8h, v26.8h\n"
- "smlal v5.4s, v12.4h, v7.4h\n"
- "sqrdmulh v5.4s, v5.4s, v29.4s\n"
- "add x12, x12, #0x20\n"
- "smlal2 v8.4s, v11.8h, v28.8h\n"
- "smlal2 v0.4s, v16.8h, v7.8h\n"
- "and v16.16b, v5.16b, v30.16b\n"
- "smlal2 v31.4s, v18.8h, v7.8h\n"
- "smlal v21.4s, v2.4h, v7.4h\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smlal v20.4s, v10.4h, v9.4h\n"
- "smlal v19.4s, v22.4h, v26.4h\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "smlal2 v3.4s, v12.8h, v7.8h\n"
- "smlal2 v8.4s, v2.8h, v7.8h\n"
- "sqrdmulh v3.4s, v3.4s, v14.4s\n"
- "smlal2 v0.4s, v10.8h, v9.8h\n"
- "smlal2 v31.4s, v22.8h, v26.8h\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "smlal v21.4s, v23.4h, v4.4h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "sqrdmulh v21.4s, v21.4s, v29.4s\n"
- "smlal v19.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "sqrdmulh v20.4s, v20.4s, v29.4s\n"
- "smlal2 v0.4s, v22.8h, v4.8h\n"
- "smlal2 v31.4s, v27.8h, v4.8h\n"
- "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "ldr q30, [x7, #0x0]\n"
+ "ldr q17, [x8, #0x0]\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x24, [x5, #0x78]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "ldr x23, [x5, #0x60]\n"
+ "ldr x10, [x5, #0x80]\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "ldr q26, [x7, #0x10]\n"
+ "ldr x22, [x5, #0x68]\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "ldr d31, [x20, x3]\n"
+ "ldr x21, [x5, #0x88]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "ldr x20, [x5, #0x40]\n"
+ "ldr x9, [x5, #0x70]\n"
+ "tst x2, #0x7\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr x28, [x5, #0x98]\n"
+ "add x7, x7, #0x20\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x27, [x5, #0x50]\n"
+ "ldr x26, [x5, #0x48]\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "ldr d20, [x23, x3]\n"
+ "ldr x25, [x5, #0x90]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "ldr x24, [x5, #0xa8]\n"
+ "ldr x23, [x5, #0xa0]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "ldr d31, [x10, x3]\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal v3.4s, v28.4h, v27.4h\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "ldr x22, [x5, #0xb0]\n"
+ "smlal2 v24.4s, v28.8h, v27.8h\n"
+ "ldr d28, [x21, x3]\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "ldr x21, [x5, #0xb8]\n"
+ "smlal v10.4s, v20.4h, v16.4h\n"
+ "smlal2 v21.4s, v20.8h, v16.8h\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "ldr d16, [x9, x3]\n"
+ "smlal v3.4s, v31.4h, v11.4h\n"
+ "smlal2 v24.4s, v31.8h, v11.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "ldr d0, [x28, x3]\n"
+ "smlal v10.4s, v6.4h, v27.4h\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "smlal2 v21.4s, v6.8h, v27.8h\n"
+ "ldr d6, [x27, x3]\n"
+ "smlal v8.4s, v9.4h, v27.4h\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal2 v4.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x26, x3]\n"
+ "ldr d27, [x25, x3]\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x24, x3]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v10.4s, v16.4h, v11.4h\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "ldr d18, [x23, x3]\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x22, x3]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v3.4s, v0.4h, v29.4h\n"
+ "smlal v19.4s, v6.4h, v2.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v22.4h\n"
+ "smlal2 v1.4s, v6.8h, v2.8h\n"
+ "usubl v11.8h, v11.8b, v13.8b\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v21.4s, v27.8h, v22.8h\n"
+ "ldr q27, [x8, #0x10]\n"
+ "smlal v3.4s, v28.4h, v15.4h\n"
+ "smlal v19.4s, v20.4h, v22.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v24.4s, v28.8h, v15.8h\n"
+ "smlal v8.4s, v6.4h, v15.4h\n"
+ "add x8, x8, #0x20\n"
+ "smlal v10.4s, v18.4h, v5.4h\n"
+ "smlal2 v1.4s, v20.8h, v22.8h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "smlal2 v4.4s, v6.8h, v15.8h\n"
+ "smlal2 v21.4s, v18.8h, v5.8h\n"
+ "smlal v3.4s, v11.4h, v5.4h\n"
+ "smlal v19.4s, v16.4h, v5.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v8.4s, v31.4h, v5.4h\n"
+ "smlal v10.4s, v28.4h, v2.4h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal2 v4.4s, v31.8h, v5.8h\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal v3.4s, v29.4h, v22.4h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "smlal2 v24.4s, v29.8h, v22.8h\n"
+ "smlal v8.4s, v0.4h, v7.4h\n"
+ "smlal v10.4s, v29.4h, v7.4h\n"
+ "sqrdmulh v1.4s, v1.4s, v26.4s\n"
+ "and v5.16b, v19.16b, v17.16b\n"
+ "smlal2 v4.4s, v0.8h, v7.8h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v3.4s, v9.4h, v7.4h\n"
+ "smlal2 v24.4s, v9.8h, v7.8h\n"
+ "and v16.16b, v1.16b, v27.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v30.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v30.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v23.16b, v21.16b, v30.16b\n"
- "sqrdmulh v8.4s, v8.4s, v14.4s\n"
- "and v27.16b, v20.16b, v30.16b\n"
- "sqrdmulh v0.4s, v0.4s, v14.4s\n"
- "and v22.16b, v19.16b, v30.16b\n"
- "sqrdmulh v31.4s, v31.4s, v14.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v14.16b, v8.16b, v25.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v18.16b, v0.16b, v25.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v16.16b, v31.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v23.4s\n"
- "sshr v14.4s, v14.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v27.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v26.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v5.4s\n"
+ "and v30.16b, v8.16b, v17.16b\n"
+ "and v20.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v2.16b, v3.16b, v17.16b\n"
+ "and v11.16b, v4.16b, v27.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v27.16b\n"
+ "and v16.16b, v24.16b, v27.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v30.4s\n"
- "srshl v21.4s, v21.4s, v30.4s\n"
- "sqadd v8.4s, v8.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v19.4s, v19.4s, v30.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqadd v3.4s, v3.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v17.4s\n"
+ "srshl v8.4s, v8.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v11.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "srshl v3.4s, v3.4s, v17.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v27.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str d5, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
+ "srshl v4.4s, v4.4s, v27.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v27.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d20, [x9, x16]\n"
- "str d19, [x28, x16]\n"
- "add x16, x16, #0x8\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str d19, [x16, x4]\n"
+ "str d8, [x15, x4]\n"
+ "str d10, [x14, x4]\n"
+ "str d3, [x13, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 88f\n"
- "add x14, x14, #0x48\n"
+ "add x6, x6, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v5.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v3.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v3.s }[2], [x20]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v19.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v3.s }[0], [x20]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v5.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v5.s }[2], [x20]\n"
+ "tbz x2, #1, 6f\n"
+ "ld1 { v19.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v19.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v5.s }[0], [x20]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v19.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "usubl v11.8h, v11.8b, v15.8b\n"
- "usubl v22.8h, v22.8b, v15.8b\n"
- "ldr d4, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "usubl v14.8h, v14.8b, v15.8b\n"
- "usubl v28.8h, v28.8b, v15.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "usubl v18.8h, v18.8b, v15.8b\n"
- "usubl v9.8h, v9.8b, v15.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "usubl v26.8h, v26.8b, v15.8b\n"
- "usubl v7.8h, v7.8b, v15.8b\n"
- "usubl v4.8h, v4.8b, v15.8b\n"
- "add x27, x27, x17\n"
- "add x26, x26, x17\n"
- "add x25, x25, x17\n"
- "add x24, x24, x17\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "ld1 { v1.s }[0], [x25], #0x4\n"
- "ld1 { v2.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v16.s }[0], [x22], #0x4\n"
- "ld1 { v23.s }[0], [x21], #0x4\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
- "ld1 { v2.h }[2], [x24], #0x2\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
- "ld1 { v16.h }[2], [x22], #0x2\n"
- "ld1 { v23.h }[2], [x21], #0x2\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[6], [x27]\n"
- "ld1 { v27.b }[6], [x26]\n"
- "ld1 { v1.b }[6], [x25]\n"
- "ld1 { v2.b }[6], [x24]\n"
- "ld1 { v12.b }[6], [x23]\n"
- "ld1 { v16.b }[6], [x22]\n"
- "ld1 { v23.b }[6], [x21]\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "ldr d7, [x6, #0x40]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "usubl v7.8h, v7.8b, v14.8b\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v20.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v0.s }[0], [x21], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v20.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v0.h }[2], [x21], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v20.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v6.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v0.b }[6], [x21]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[4], [x27]\n"
- "ld1 { v27.b }[4], [x26]\n"
- "ld1 { v1.b }[4], [x25]\n"
- "ld1 { v2.b }[4], [x24]\n"
- "ld1 { v12.b }[4], [x23]\n"
- "ld1 { v16.b }[4], [x22]\n"
- "ld1 { v23.b }[4], [x21]\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v20.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v6.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v0.b }[4], [x21]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "ld1 { v1.h }[0], [x25], #0x2\n"
- "ld1 { v2.h }[0], [x24], #0x2\n"
- "ld1 { v12.h }[0], [x23], #0x2\n"
- "ld1 { v16.h }[0], [x22], #0x2\n"
- "ld1 { v23.h }[0], [x21], #0x2\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[2], [x27]\n"
- "ld1 { v27.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "ld1 { v12.b }[2], [x23]\n"
- "ld1 { v16.b }[2], [x22]\n"
- "ld1 { v23.b }[2], [x21]\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x2, #1, 10f\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v20.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v0.h }[0], [x21], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v20.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v6.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v0.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[0], [x27]\n"
- "ld1 { v27.b }[0], [x26]\n"
- "ld1 { v1.b }[0], [x25]\n"
- "ld1 { v2.b }[0], [x24]\n"
- "ld1 { v12.b }[0], [x23]\n"
- "ld1 { v16.b }[0], [x22]\n"
- "ld1 { v23.b }[0], [x21]\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v20.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v6.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v0.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x20, [x15, #0x40]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "usubl v26.8h, v26.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x40]\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x2, #1, 14f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v15.8h, v15.8b, v6.8b\n"
- "ldr x20, [x15, #0x48]\n"
- "smlal v21.4s, v15.4h, v18.4h\n"
- "smlal2 v8.4s, v15.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x48]\n"
+ "smlal v8.4s, v30.4h, v27.4h\n"
+ "smlal2 v4.4s, v30.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 18f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x50]\n"
- "smlal v21.4s, v16.4h, v9.4h\n"
- "smlal2 v8.4s, v16.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x50]\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 22f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v5.4s, v16.4h, v9.4h\n"
- "smlal2 v3.4s, v16.8h, v9.8h\n"
- "smlal v21.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v19.4s, v17.4h, v2.4h\n"
+ "smlal2 v1.4s, v17.8h, v2.8h\n"
+ "smlal v8.4s, v17.4h, v15.4h\n"
+ "smlal2 v4.4s, v17.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 26f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v20.4s, v16.4h, v28.4h\n"
- "smlal2 v0.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 30f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v5.4s, v16.4h, v26.4h\n"
- "smlal2 v3.4s, v16.8h, v26.8h\n"
- "smlal v20.4s, v16.4h, v11.4h\n"
- "smlal2 v0.4s, v16.8h, v11.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v19.4s, v17.4h, v22.4h\n"
+ "smlal2 v1.4s, v17.8h, v22.8h\n"
+ "smlal v10.4s, v17.4h, v16.4h\n"
+ "smlal2 v21.4s, v17.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 34f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal v20.4s, v16.4h, v18.4h\n"
- "smlal2 v0.4s, v16.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v10.4s, v30.4h, v27.4h\n"
+ "smlal2 v21.4s, v30.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 38f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v16.4h, v7.4h\n"
- "smlal2 v3.4s, v16.8h, v7.8h\n"
- "smlal v20.4s, v16.4h, v22.4h\n"
- "smlal2 v0.4s, v16.8h, v22.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v19.4s, v17.4h, v5.4h\n"
+ "smlal2 v1.4s, v17.8h, v5.8h\n"
+ "smlal v10.4s, v17.4h, v11.4h\n"
+ "smlal2 v21.4s, v17.8h, v11.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 41f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
+ "tbz x2, #1, 40f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 42f\n"
+ "tbz x2, #1, 42f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x80]\n"
- "smlal v19.4s, v16.4h, v18.4h\n"
- "smlal2 v31.4s, v16.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v3.4s, v16.4h, v27.4h\n"
+ "smlal2 v24.4s, v16.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 46f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x88]\n"
- "smlal v21.4s, v16.4h, v7.4h\n"
- "smlal2 v8.4s, v16.8h, v7.8h\n"
- "smlal v19.4s, v16.4h, v22.4h\n"
- "smlal2 v31.4s, v16.8h, v22.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v8.4s, v17.4h, v5.4h\n"
+ "smlal2 v4.4s, v17.8h, v5.8h\n"
+ "smlal v3.4s, v17.4h, v11.4h\n"
+ "smlal2 v24.4s, v17.8h, v11.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 49f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
+ "tbz x2, #1, 48f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x7, #1, 50f\n"
+ "tbz x2, #1, 50f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x90]\n"
- "smlal v19.4s, v16.4h, v9.4h\n"
- "smlal2 v31.4s, v16.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v3.4s, v16.4h, v2.4h\n"
+ "smlal2 v24.4s, v16.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 54f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x98]\n"
- "smlal v20.4s, v16.4h, v26.4h\n"
- "smlal2 v0.4s, v16.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 57f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v10.4s, v17.4h, v22.4h\n"
+ "smlal2 v21.4s, v17.8h, v22.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 57f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 56f\n"
+ "tbz x2, #1, 56f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x7, #1, 58f\n"
+ "tbz x2, #1, 58f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal v21.4s, v16.4h, v4.4h\n"
- "smlal2 v8.4s, v16.8h, v4.8h\n"
- "smlal v19.4s, v16.4h, v14.4h\n"
- "smlal2 v31.4s, v16.8h, v14.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 61f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 60f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v8.4s, v16.4h, v7.4h\n"
+ "smlal2 v4.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v29.4h\n"
+ "smlal2 v24.4s, v16.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 62f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v20.4s, v16.4h, v7.4h\n"
- "smlal2 v0.4s, v16.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 65f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v10.4s, v17.4h, v5.4h\n"
+ "smlal2 v21.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 65f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 64f\n"
+ "tbz x2, #1, 64f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 66f\n"
+ "tbz x2, #1, 66f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xb0]\n"
- "smlal v20.4s, v16.4h, v9.4h\n"
- "smlal2 v0.4s, v16.8h, v9.8h\n"
- "smlal v19.4s, v16.4h, v28.4h\n"
- "smlal2 v31.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 69f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 68f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v10.4s, v16.4h, v2.4h\n"
+ "smlal2 v21.4s, v16.8h, v2.8h\n"
+ "smlal v3.4s, v16.4h, v15.4h\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x7, #1, 70f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 70f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal v19.4s, v16.4h, v7.4h\n"
- "smlal2 v31.4s, v16.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 73f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v3.4s, v17.4h, v5.4h\n"
+ "smlal2 v24.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 73f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 72f\n"
+ "tbz x2, #1, 72f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x7, #1, 74f\n"
+ "tbz x2, #1, 74f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal v20.4s, v16.4h, v4.4h\n"
- "smlal2 v0.4s, v16.8h, v4.8h\n"
- "smlal v19.4s, v16.4h, v26.4h\n"
- "smlal2 v31.4s, v16.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 77f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 76f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v10.4s, v16.4h, v7.4h\n"
+ "smlal2 v21.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v22.4h\n"
+ "smlal2 v24.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x7, #1, 78f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 78f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "smlal v19.4s, v16.4h, v4.4h\n"
- "smlal2 v31.4s, v16.8h, v4.8h\n"
- "tbz x7, #2, 81f\n"
- "ld1 { v14.4s }, [x13], #0x10\n"
- "ld1 { v25.4s }, [x12], #0x10\n"
- "tbz x7, #1, 80f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v12.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x12]\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "smlal v3.4s, v17.4h, v7.4h\n"
+ "smlal2 v24.4s, v17.8h, v7.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v16.4s }, [x7], #0x10\n"
+ "ld1 { v22.4s }, [x8], #0x10\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v0.d }[0], [x7], #0x8\n"
+ "ld1 { v31.d }[0], [x8], #0x8\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v0.s }[2], [x7]\n"
+ "ld1 { v31.s }[2], [x8]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v12.s }[0], [x12]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v0.s }[0], [x7]\n"
+ "ld1 { v31.s }[0], [x8]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 82f\n"
- "ld1 { v14.d }[0], [x13], #0x8\n"
- "ld1 { v25.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v14.s }[2], [x13]\n"
- "ld1 { v25.s }[2], [x12]\n"
+ "tbz x2, #1, 82f\n"
+ "ld1 { v16.d }[0], [x7], #0x8\n"
+ "ld1 { v22.d }[0], [x8], #0x8\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v16.s }[2], [x7]\n"
+ "ld1 { v22.s }[2], [x8]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v14.s }[0], [x13]\n"
- "ld1 { v25.s }[0], [x12]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v16.s }[0], [x7]\n"
+ "ld1 { v22.s }[0], [x8]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v5.4s, v5.4s, v14.4s\n"
- "and v28.16b, v5.16b, v25.16b\n"
- "add x11, x11, x16\n"
- "add x10, x10, x16\n"
- "sqrdmulh v3.4s, v3.4s, v18.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "add x9, x9, x16\n"
- "add x28, x28, x16\n"
- "and v16.16b, v3.16b, v12.16b\n"
- "sqrdmulh v21.4s, v21.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v14.4s\n"
- "sqrdmulh v19.4s, v19.4s, v14.4s\n"
- "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v0.4s\n"
+ "add x16, x16, x4\n"
+ "add x15, x15, x4\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "add x14, x14, x4\n"
+ "add x13, x13, x4\n"
+ "sqrdmulh v3.4s, v3.4s, v16.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v0.4s\n"
+ "and v17.16b, v19.16b, v22.16b\n"
+ "and v16.16b, v1.16b, v31.16b\n"
+ "and v15.16b, v8.16b, v22.16b\n"
+ "and v20.16b, v10.16b, v22.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v14.16b, v21.16b, v25.16b\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "and v6.16b, v20.16b, v25.16b\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v31.4s, v31.4s, v18.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v14.4s, v14.4s, #0x1f\n"
- "and v18.16b, v8.16b, v12.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v7.16b, v0.16b, v12.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v16.16b, v31.16b, v12.16b\n"
- "sqadd v21.4s, v21.4s, v14.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v6.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v26.16b, v4.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v0.16b, v21.16b, v31.16b\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "and v17.16b, v3.16b, v22.16b\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v16.16b, v24.16b, v31.16b\n"
+ "sqadd v8.4s, v8.4s, v15.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v25.4s\n"
- "srshl v21.4s, v21.4s, v25.4s\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v25.4s\n"
- "sqadd v0.4s, v0.4s, v7.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v12.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v12.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v12.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v8.4s, v8.4s, v22.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v31.4s\n"
+ "srshl v3.4s, v3.4s, v22.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "srshl v4.4s, v4.4s, v31.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "tbz x7, #2, 85f\n"
- "st1 { v5.s }[0], [x11], #0x4\n"
- "st1 { v21.s }[0], [x10], #0x4\n"
- "st1 { v20.s }[0], [x9], #0x4\n"
- "st1 { v19.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 84f\n"
- "st1 { v5.h }[2], [x11], #0x2\n"
- "st1 { v21.h }[2], [x10], #0x2\n"
- "st1 { v20.h }[2], [x9], #0x2\n"
- "st1 { v19.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[6], [x11], #0x1\n"
- "st1 { v21.b }[6], [x10], #0x1\n"
- "st1 { v20.b }[6], [x9], #0x1\n"
- "st1 { v19.b }[6], [x28], #0x1\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "tbz x2, #2, 85f\n"
+ "st1 { v19.s }[0], [x16], #0x4\n"
+ "st1 { v8.s }[0], [x15], #0x4\n"
+ "st1 { v10.s }[0], [x14], #0x4\n"
+ "st1 { v3.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "st1 { v19.h }[2], [x16], #0x2\n"
+ "st1 { v8.h }[2], [x15], #0x2\n"
+ "st1 { v10.h }[2], [x14], #0x2\n"
+ "st1 { v3.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[6], [x16], #0x1\n"
+ "st1 { v8.b }[6], [x15], #0x1\n"
+ "st1 { v10.b }[6], [x14], #0x1\n"
+ "st1 { v3.b }[6], [x13], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[4], [x11], #0x1\n"
- "st1 { v21.b }[4], [x10], #0x1\n"
- "st1 { v20.b }[4], [x9], #0x1\n"
- "st1 { v19.b }[4], [x28], #0x1\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[4], [x16], #0x1\n"
+ "st1 { v8.b }[4], [x15], #0x1\n"
+ "st1 { v10.b }[4], [x14], #0x1\n"
+ "st1 { v3.b }[4], [x13], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 86f\n"
- "st1 { v5.h }[0], [x11], #0x2\n"
- "st1 { v21.h }[0], [x10], #0x2\n"
- "st1 { v20.h }[0], [x9], #0x2\n"
- "st1 { v19.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[2], [x11], #0x1\n"
- "st1 { v21.b }[2], [x10], #0x1\n"
- "st1 { v20.b }[2], [x9], #0x1\n"
- "st1 { v19.b }[2], [x28], #0x1\n"
+ "tbz x2, #1, 86f\n"
+ "st1 { v19.h }[0], [x16], #0x2\n"
+ "st1 { v8.h }[0], [x15], #0x2\n"
+ "st1 { v10.h }[0], [x14], #0x2\n"
+ "st1 { v3.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[2], [x16], #0x1\n"
+ "st1 { v8.b }[2], [x15], #0x1\n"
+ "st1 { v10.b }[2], [x14], #0x1\n"
+ "st1 { v3.b }[2], [x13], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[0], [x11], #0x1\n"
- "st1 { v21.b }[0], [x10], #0x1\n"
- "st1 { v20.b }[0], [x9], #0x1\n"
- "st1 { v19.b }[0], [x28], #0x1\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[0], [x16], #0x1\n"
+ "st1 { v8.b }[0], [x15], #0x1\n"
+ "st1 { v10.b }[0], [x14], #0x1\n"
+ "st1 { v3.b }[0], [x13], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index a3fa93df9c..5798451720 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -113,1743 +113,1743 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x2, x1, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v18.16b }, [x20]\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x14, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v15.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v26.8h }, [x20]\n"
+ "ld1r { v9.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.8h }, [x21]\n"
- "ld1r { v0.8h }, [x20]\n"
- "mov x3, #0x0\n"
- "mov x4, #0x0\n"
- "add x5, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x16, [x22, #0x0]\n"
- "ldp x15, x14, [x22, #0x10]\n"
- "cbz x2, 3f\n"
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "subs x2, x2, #0x1\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "usubl v14.8h, v14.8b, v13.8b\n"
- "usubl v10.8h, v10.8b, v13.8b\n"
- "ldr d12, [x6, #0x20]\n"
+ "ld1r { v10.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "ldp x8, x17, [x22, #0x0]\n"
+ "ldp x16, x15, [x22, #0x10]\n"
+ "cbz x14, 3f\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "subs x14, x14, #0x1\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "ldr d23, [x5, #0x20]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "usubl v12.8h, v12.8b, v13.8b\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q15, [x20, #0x10]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "usubl v12.8h, v12.8b, v9.8b\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "usubl v11.8h, v11.8b, v9.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
"add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ldr d31, [x9, x3]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldr d17, [x28, x3]\n"
- "ldr d30, [x27, x3]\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "ldr d16, [x26, x3]\n"
- "ldr d3, [x25, x3]\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "ldr d4, [x24, x3]\n"
- "ldr d25, [x23, x3]\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "ldr d9, [x22, x3]\n"
- "ldr d29, [x21, x3]\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "ldr d28, [x20, x3]\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d24, [x9, x2]\n"
+ "ldr d21, [x28, x2]\n"
+ "ldr d16, [x27, x2]\n"
+ "ldr d20, [x26, x2]\n"
+ "ldr d7, [x25, x2]\n"
+ "ldr d19, [x24, x2]\n"
+ "ldr d28, [x23, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "ldr d26, [x22, x2]\n"
+ "ldr d29, [x21, x2]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr d18, [x20, x2]\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr d2, [x6, #0x28]\n"
- "ldr d27, [x6, #0x30]\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "ldr d1, [x6, #0x38]\n"
- "ldr d31, [x6, #0x40]\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "ldr d8, [x6, #0x48]\n"
- "ldr x22, [x5, #0x50]\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "ldr x20, [x5, #0x58]\n"
- "ldr x21, [x5, #0x60]\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "usubl v27.8h, v27.8b, v13.8b\n"
- "ldr x22, [x5, #0x70]\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "ldr d14, [x20, x3]\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal v23.4s, v17.4h, v10.4h\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "ldr x20, [x5, #0x78]\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "ldr x21, [x5, #0x80]\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "ldr d25, [x22, x3]\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v10.8h\n"
- "ldr d10, [x20, x3]\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal v24.4s, v17.4h, v21.4h\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "ldr x24, [x5, #0x88]\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "smlal v7.4s, v30.4h, v2.4h\n"
- "ldr x20, [x5, #0x90]\n"
- "ldr x23, [x5, #0x98]\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "ldr d9, [x21, x3]\n"
- "smlal2 v22.4s, v17.8h, v21.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "ldr d21, [x6, #0x50]\n"
- "smlal v20.4s, v3.4h, v12.4h\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "ldr x22, [x5, #0xa0]\n"
- "ldr x21, [x5, #0xa8]\n"
- "smlal2 v15.4s, v30.8h, v2.8h\n"
- "ldr d30, [x24, x3]\n"
- "smlal v7.4s, v16.4h, v27.4h\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v12.8h\n"
- "ldr d3, [x6, #0x58]\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "ldr d12, [x20, x3]\n"
- "smlal v20.4s, v16.4h, v2.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal v23.4s, v14.4h, v2.4h\n"
- "ldr x20, [x5, #0xb0]\n"
- "ldr x13, [x5, #0xb8]\n"
- "smlal2 v15.4s, v16.8h, v27.8h\n"
- "smlal v7.4s, v4.4h, v1.4h\n"
- "ldr x12, [x5, #0xc0]\n"
- "ldr x11, [x5, #0xc8]\n"
- "smlal2 v5.4s, v16.8h, v2.8h\n"
- "ldr d16, [x23, x3]\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v2.8h\n"
- "ldr d2, [x6, #0x60]\n"
- "smlal v20.4s, v4.4h, v27.4h\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v27.4h\n"
- "smlal v23.4s, v25.4h, v27.4h\n"
- "ldr x10, [x5, #0xd0]\n"
- "ldr x9, [x5, #0xd8]\n"
- "smlal2 v15.4s, v4.8h, v1.8h\n"
- "smlal v7.4s, v17.4h, v31.4h\n"
- "ldr x28, [x5, #0xe0]\n"
- "ldr x27, [x5, #0xe8]\n"
- "smlal2 v5.4s, v4.8h, v27.8h\n"
- "ldr d4, [x22, x3]\n"
- "smlal2 v22.4s, v14.8h, v27.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v27.8h\n"
- "ldr d27, [x6, #0x68]\n"
- "smlal v20.4s, v17.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v1.4h\n"
- "smlal v23.4s, v10.4h, v1.4h\n"
- "ldr x26, [x5, #0xf0]\n"
- "ldr x25, [x5, #0xf8]\n"
- "smlal2 v15.4s, v17.8h, v31.8h\n"
- "smlal v7.4s, v6.4h, v8.4h\n"
- "ldr x24, [x5, #0x100]\n"
- "ldr x23, [x5, #0x108]\n"
- "smlal2 v5.4s, v17.8h, v1.8h\n"
- "ldr d17, [x21, x3]\n"
- "smlal2 v22.4s, v25.8h, v1.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v1.8h\n"
- "ldr d1, [x6, #0x70]\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v31.4h\n"
- "smlal v23.4s, v9.4h, v31.4h\n"
- "ldr x22, [x5, #0x110]\n"
- "ldr x21, [x5, #0x118]\n"
- "smlal2 v15.4s, v6.8h, v8.8h\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
- "subs x2, x2, #0x1\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal2 v22.4s, v10.8h, v31.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v31.8h\n"
- "ldr d31, [x6, #0x78]\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v9.4h, v8.4h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr d3, [x5, #0x28]\n"
+ "ldr d2, [x5, #0x30]\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "ldr d4, [x5, #0x38]\n"
+ "ldr d22, [x5, #0x40]\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ldr d24, [x5, #0x48]\n"
+ "ldr x23, [x4, #0x50]\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x50]\n"
+ "ldr x22, [x4, #0x58]\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "ldr d21, [x5, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x28, [x4, #0x70]\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "ldr d12, [x23, x2]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x22, x2]\n"
+ "ldr x27, [x4, #0x78]\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "ldr x26, [x4, #0x80]\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "ldr x25, [x4, #0x88]\n"
+ "ldr x24, [x4, #0x90]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
+ "ldr x23, [x4, #0x98]\n"
+ "ldr x22, [x4, #0xa0]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x21, x2]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v17.4h\n"
+ "smlal2 v30.4s, v12.8h, v17.8h\n"
+ "ldr d17, [x20, x2]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal v1.4s, v12.4h, v11.4h\n"
+ "usubl v21.8h, v21.8b, v9.8b\n"
+ "ldr x21, [x4, #0xa8]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal2 v25.4s, v12.8h, v11.8h\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x28, x2]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v7.4h, v11.4h\n"
+ "smlal2 v30.4s, v7.8h, v11.8h\n"
+ "ldr d11, [x27, x2]\n"
+ "ldr x13, [x4, #0xb8]\n"
+ "smlal v27.4s, v28.4h, v23.4h\n"
+ "smlal v1.4s, v7.4h, v23.4h\n"
+ "ldr x12, [x4, #0xc0]\n"
+ "ldr x11, [x4, #0xc8]\n"
+ "smlal2 v6.4s, v28.8h, v23.8h\n"
+ "ldr d28, [x26, x2]\n"
+ "smlal2 v25.4s, v7.8h, v23.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v16.4h, v3.4h\n"
+ "smlal2 v0.4s, v16.8h, v3.8h\n"
+ "ldr d16, [x25, x2]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "ldr d23, [x24, x2]\n"
+ "ldr x10, [x4, #0xd0]\n"
+ "smlal v27.4s, v20.4h, v3.4h\n"
+ "smlal v1.4s, v18.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x9, [x4, #0xd8]\n"
+ "smlal2 v6.4s, v20.8h, v3.8h\n"
+ "smlal2 v25.4s, v18.8h, v3.8h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x28, [x4, #0xe0]\n"
+ "smlal v8.4s, v20.4h, v2.4h\n"
+ "smlal2 v0.4s, v20.8h, v2.8h\n"
+ "ldr d20, [x23, x2]\n"
+ "usubl v23.8h, v23.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "smlal2 v30.4s, v17.8h, v3.8h\n"
+ "ldr d3, [x5, #0x60]\n"
+ "ldr x27, [x4, #0xe8]\n"
+ "smlal v27.4s, v19.4h, v2.4h\n"
+ "smlal v1.4s, v17.4h, v2.4h\n"
+ "ldr x26, [x4, #0xf0]\n"
+ "ldr x25, [x4, #0xf8]\n"
+ "smlal2 v6.4s, v19.8h, v2.8h\n"
+ "smlal2 v25.4s, v17.8h, v2.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x24, [x4, #0x100]\n"
+ "smlal v8.4s, v19.4h, v4.4h\n"
+ "smlal2 v0.4s, v19.8h, v4.8h\n"
+ "ldr d19, [x22, x2]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v2.4h\n"
+ "smlal2 v30.4s, v26.8h, v2.8h\n"
+ "ldr d2, [x5, #0x68]\n"
+ "ldr x23, [x4, #0x108]\n"
+ "smlal v27.4s, v12.4h, v4.4h\n"
+ "smlal v1.4s, v26.4h, v4.4h\n"
+ "ldr x22, [x4, #0x110]\n"
+ "subs x14, x14, #0x1\n"
+ "smlal2 v6.4s, v12.8h, v4.8h\n"
+ "smlal2 v25.4s, v26.8h, v4.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v22.4h\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d12, [x21, x2]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v11.8h, v4.8h\n"
+ "ldr d4, [x5, #0x70]\n"
+ "ldr x21, [x4, #0x118]\n"
+ "smlal v27.4s, v7.4h, v22.4h\n"
+ "smlal v1.4s, v11.4h, v22.4h\n"
+ "smlal2 v6.4s, v7.8h, v22.8h\n"
+ "smlal2 v25.4s, v11.8h, v22.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v24.4h\n"
+ "smlal2 v0.4s, v7.8h, v24.8h\n"
+ "ldr d7, [x20, x2]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v22.4h\n"
+ "smlal2 v30.4s, v28.8h, v22.8h\n"
+ "ldr d22, [x5, #0x78]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "ldr d28, [x13, x3]\n"
- "smlal v7.4s, v14.4h, v3.4h\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v5.4s, v29.8h, v8.8h\n"
- "ldr d29, [x6, #0x80]\n"
- "smlal2 v22.4s, v9.8h, v8.8h\n"
- "usubl v29.8h, v29.8b, v13.8b\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "ldr d8, [x12, x3]\n"
- "smlal v20.4s, v14.4h, v21.4h\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "smlal v24.4s, v12.4h, v21.4h\n"
- "smlal v23.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v14.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v14.8h, v21.8h\n"
- "ldr d14, [x11, x3]\n"
- "smlal2 v22.4s, v12.8h, v21.8h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v21.8h\n"
- "ldr d21, [x6, #0x88]\n"
- "smlal v20.4s, v25.4h, v3.4h\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v16.4h, v3.4h\n"
- "smlal v23.4s, v4.4h, v3.4h\n"
- "smlal2 v15.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v10.4h, v27.4h\n"
- "smlal2 v5.4s, v25.8h, v3.8h\n"
- "ldr d25, [x10, x3]\n"
- "smlal2 v22.4s, v16.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v3.8h\n"
- "ldr d3, [x6, #0x90]\n"
- "smlal v20.4s, v10.4h, v2.4h\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal v24.4s, v4.4h, v2.4h\n"
- "smlal v23.4s, v17.4h, v2.4h\n"
- "smlal2 v15.4s, v10.8h, v27.8h\n"
- "smlal v7.4s, v9.4h, v1.4h\n"
- "smlal2 v5.4s, v10.8h, v2.8h\n"
- "ldr d10, [x9, x3]\n"
- "smlal2 v22.4s, v4.8h, v2.8h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v2.8h\n"
- "ldr d2, [x6, #0x98]\n"
- "smlal v20.4s, v9.4h, v27.4h\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v17.4h, v27.4h\n"
- "smlal v23.4s, v6.4h, v27.4h\n"
- "smlal2 v15.4s, v9.8h, v1.8h\n"
- "smlal v7.4s, v12.4h, v31.4h\n"
- "smlal2 v5.4s, v9.8h, v27.8h\n"
- "ldr d9, [x28, x3]\n"
- "smlal2 v22.4s, v17.8h, v27.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v27.8h\n"
- "ldr d27, [x6, #0xa0]\n"
- "smlal v20.4s, v30.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v1.4h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v15.4s, v12.8h, v31.8h\n"
- "ldr d12, [x27, x3]\n"
- "smlal v7.4s, v16.4h, v29.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal2 v5.4s, v30.8h, v1.8h\n"
- "ldr d30, [x6, #0xa8]\n"
- "smlal2 v22.4s, v6.8h, v1.8h\n"
- "usubl v30.8h, v30.8b, v13.8b\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "ldr d1, [x26, x3]\n"
- "smlal v20.4s, v16.4h, v31.4h\n"
- "usubl v1.8h, v1.8b, v18.8b\n"
- "smlal v24.4s, v8.4h, v31.4h\n"
- "smlal v23.4s, v14.4h, v31.4h\n"
- "smlal2 v15.4s, v16.8h, v29.8h\n"
- "smlal v7.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v16.8h, v31.8h\n"
- "ldr d16, [x25, x3]\n"
- "smlal2 v22.4s, v8.8h, v31.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v31.8h\n"
- "ldr d31, [x6, #0xb0]\n"
- "smlal v20.4s, v4.4h, v29.4h\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v29.4h\n"
- "smlal v23.4s, v25.4h, v29.4h\n"
- "smlal2 v15.4s, v4.8h, v21.8h\n"
- "smlal v7.4s, v17.4h, v3.4h\n"
- "smlal2 v5.4s, v4.8h, v29.8h\n"
- "ldr d4, [x24, x3]\n"
- "smlal2 v22.4s, v14.8h, v29.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v29.8h\n"
- "ldr d29, [x6, #0xb8]\n"
- "smlal v20.4s, v17.4h, v21.4h\n"
- "usubl v29.8h, v29.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v15.4s, v17.8h, v3.8h\n"
- "smlal v7.4s, v6.4h, v2.4h\n"
- "smlal2 v5.4s, v17.8h, v21.8h\n"
- "ldr d17, [x23, x3]\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "ldr d21, [x6, #0xc0]\n"
- "smlal v20.4s, v6.4h, v3.4h\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v3.4h\n"
- "smlal v23.4s, v9.4h, v3.4h\n"
- "add x6, x6, #0xc8\n"
- "smlal2 v15.4s, v6.8h, v2.8h\n"
- "smlal v7.4s, v8.4h, v27.4h\n"
- "smlal2 v5.4s, v6.8h, v3.8h\n"
- "ldr d6, [x22, x3]\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v3.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal v20.4s, v28.4h, v2.4h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal v24.4s, v9.4h, v2.4h\n"
- "smlal v23.4s, v12.4h, v2.4h\n"
- "add x3, x3, #0x8\n"
- "smlal2 v15.4s, v8.8h, v27.8h\n"
- "ldr q8, [x7, #0x0]\n"
- "smlal v7.4s, v14.4h, v30.4h\n"
- "smlal2 v5.4s, v28.8h, v2.8h\n"
- "ldr q28, [x8, #0x0]\n"
- "smlal2 v22.4s, v9.8h, v2.8h\n"
- "smlal2 v19.4s, v12.8h, v2.8h\n"
- "ldr q2, [x7, #0x10]\n"
- "smlal v20.4s, v14.4h, v27.4h\n"
+ "smlal v27.4s, v29.4h, v24.4h\n"
+ "smlal v1.4s, v28.4h, v24.4h\n"
+ "smlal2 v6.4s, v29.8h, v24.8h\n"
+ "ldr d29, [x13, x2]\n"
+ "smlal2 v25.4s, v28.8h, v24.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v18.4h, v31.4h\n"
+ "smlal2 v0.4s, v18.8h, v31.8h\n"
+ "ldr d18, [x5, #0x80]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v24.4h\n"
+ "smlal2 v30.4s, v16.8h, v24.8h\n"
+ "ldr d24, [x12, x2]\n"
+ "smlal v27.4s, v17.4h, v31.4h\n"
+ "smlal v1.4s, v23.4h, v31.4h\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v6.4s, v17.8h, v31.8h\n"
+ "smlal2 v25.4s, v23.8h, v31.8h\n"
+ "usubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v8.4s, v17.4h, v21.4h\n"
+ "smlal2 v0.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x11, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x88]\n"
+ "smlal v27.4s, v26.4h, v21.4h\n"
+ "smlal v1.4s, v20.4h, v21.4h\n"
+ "smlal2 v6.4s, v26.8h, v21.8h\n"
+ "smlal2 v25.4s, v20.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v3.4h\n"
+ "smlal2 v0.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x10, x2]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v21.4h\n"
+ "smlal2 v30.4s, v19.8h, v21.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "smlal v27.4s, v11.4h, v3.4h\n"
+ "smlal v1.4s, v19.4h, v3.4h\n"
+ "smlal2 v6.4s, v11.8h, v3.8h\n"
+ "smlal2 v25.4s, v19.8h, v3.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v11.4h, v2.4h\n"
+ "smlal2 v0.4s, v11.8h, v2.8h\n"
+ "ldr d11, [x9, x2]\n"
+ "usubl v21.8h, v21.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v3.4h\n"
+ "smlal2 v30.4s, v12.8h, v3.8h\n"
+ "ldr d3, [x5, #0x98]\n"
+ "smlal v27.4s, v28.4h, v2.4h\n"
+ "smlal v1.4s, v12.4h, v2.4h\n"
+ "smlal2 v6.4s, v28.8h, v2.8h\n"
+ "smlal2 v25.4s, v12.8h, v2.8h\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal2 v0.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x28, x2]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v2.4h\n"
+ "smlal2 v30.4s, v7.8h, v2.8h\n"
+ "ldr d2, [x5, #0xa0]\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v1.4s, v7.4h, v4.4h\n"
+ "smlal2 v6.4s, v16.8h, v4.8h\n"
+ "ldr d16, [x27, x2]\n"
+ "smlal2 v25.4s, v7.8h, v4.8h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v22.4h\n"
+ "smlal2 v0.4s, v23.8h, v22.8h\n"
+ "ldr d23, [x5, #0xa8]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d4, [x26, x2]\n"
+ "smlal v27.4s, v20.4h, v22.4h\n"
+ "smlal v1.4s, v24.4h, v22.4h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "smlal2 v6.4s, v20.8h, v22.8h\n"
+ "smlal2 v25.4s, v24.8h, v22.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x25, x2]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v22.4h\n"
+ "smlal2 v30.4s, v17.8h, v22.8h\n"
+ "ldr d22, [x5, #0xb0]\n"
+ "smlal v27.4s, v19.4h, v18.4h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v6.4s, v19.8h, v18.8h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "smlal v8.4s, v19.4h, v31.4h\n"
+ "smlal2 v0.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x24, x2]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v30.4s, v26.8h, v18.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "smlal v27.4s, v12.4h, v31.4h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v6.4s, v12.8h, v31.8h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v21.4h\n"
+ "smlal2 v0.4s, v12.8h, v21.8h\n"
+ "ldr d12, [x23, x2]\n"
+ "usubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d31, [x5, #0xc0]\n"
+ "add x5, x5, #0xc8\n"
+ "smlal v27.4s, v7.4h, v21.4h\n"
+ "smlal v1.4s, v11.4h, v21.4h\n"
+ "smlal2 v6.4s, v7.8h, v21.8h\n"
+ "smlal2 v25.4s, v11.8h, v21.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v3.4h\n"
+ "smlal2 v0.4s, v7.8h, v3.8h\n"
+ "ldr d7, [x22, x2]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v21.4h\n"
+ "smlal2 v30.4s, v28.8h, v21.8h\n"
+ "ldr d21, [x21, x2]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v27.4s, v29.4h, v3.4h\n"
+ "smlal v1.4s, v28.4h, v3.4h\n"
+ "smlal2 v6.4s, v29.8h, v3.8h\n"
+ "ldr q29, [x6, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v3.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal2 v0.4s, v24.8h, v2.8h\n"
+ "ldr q24, [x7, #0x0]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "smlal v5.4s, v16.4h, v3.4h\n"
+ "smlal2 v30.4s, v16.8h, v3.8h\n"
+ "ldr q3, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v27.4s, v17.4h, v2.4h\n"
+ "smlal v1.4s, v4.4h, v2.4h\n"
+ "smlal2 v6.4s, v17.8h, v2.8h\n"
+ "smlal2 v25.4s, v4.8h, v2.8h\n"
+ "ldr q4, [x7, #0x10]\n"
"add x7, x7, #0x20\n"
- "smlal v24.4s, v1.4h, v27.4h\n"
- "smlal v23.4s, v16.4h, v27.4h\n"
- "smlal2 v15.4s, v14.8h, v30.8h\n"
- "smlal v7.4s, v25.4h, v31.4h\n"
- "smlal2 v5.4s, v14.8h, v27.8h\n"
- "ldr q14, [x8, #0x10]\n"
- "smlal2 v22.4s, v1.8h, v27.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v19.4s, v16.8h, v27.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal v24.4s, v16.4h, v30.4h\n"
- "smlal v23.4s, v4.4h, v30.4h\n"
- "smlal2 v15.4s, v25.8h, v31.8h\n"
- "smlal v7.4s, v10.4h, v29.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal2 v22.4s, v16.8h, v30.8h\n"
- "smlal2 v19.4s, v4.8h, v30.8h\n"
- "smlal v20.4s, v10.4h, v31.4h\n"
- "smlal v24.4s, v4.4h, v31.4h\n"
- "smlal v23.4s, v17.4h, v31.4h\n"
- "smlal2 v15.4s, v10.8h, v29.8h\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "sqrdmulh v7.4s, v7.4s, v8.4s\n"
- "smlal2 v5.4s, v10.8h, v31.8h\n"
- "smlal2 v22.4s, v4.8h, v31.8h\n"
- "and v27.16b, v7.16b, v28.16b\n"
- "smlal2 v19.4s, v17.8h, v31.8h\n"
- "smlal v20.4s, v9.4h, v29.4h\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smlal v24.4s, v17.4h, v29.4h\n"
- "smlal v23.4s, v6.4h, v29.4h\n"
- "sqadd v7.4s, v7.4s, v27.4s\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal2 v5.4s, v9.8h, v29.8h\n"
- "sqrdmulh v15.4s, v15.4s, v2.4s\n"
- "smlal2 v22.4s, v17.8h, v29.8h\n"
- "smlal2 v19.4s, v6.8h, v29.8h\n"
- "and v9.16b, v15.16b, v14.16b\n"
- "smlal v20.4s, v12.4h, v21.4h\n"
- "smlal v24.4s, v6.4h, v21.4h\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smlal v23.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v12.8h, v21.8h\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "smlal2 v22.4s, v6.8h, v21.8h\n"
- "smlal2 v19.4s, v3.8h, v21.8h\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v25.16b, v20.16b, v28.16b\n"
- "sqrdmulh v5.4s, v5.4s, v2.4s\n"
- "and v10.16b, v24.16b, v28.16b\n"
- "sqrdmulh v22.4s, v22.4s, v2.4s\n"
- "and v21.16b, v23.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v2.4s\n"
- "sqadd v15.4s, v15.4s, v9.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "and v9.16b, v5.16b, v14.16b\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "and v12.16b, v22.16b, v14.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v17.16b, v19.16b, v14.16b\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v10.4s\n"
+ "smlal v8.4s, v17.4h, v23.4h\n"
+ "smlal2 v0.4s, v17.8h, v23.8h\n"
+ "smlal v5.4s, v20.4h, v2.4h\n"
+ "smlal2 v30.4s, v20.8h, v2.8h\n"
+ "smlal v27.4s, v26.4h, v23.4h\n"
+ "smlal v1.4s, v20.4h, v23.4h\n"
+ "smlal2 v6.4s, v26.8h, v23.8h\n"
+ "smlal2 v25.4s, v20.8h, v23.8h\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal2 v0.4s, v26.8h, v22.8h\n"
+ "smlal v5.4s, v19.4h, v23.4h\n"
+ "smlal2 v30.4s, v19.8h, v23.8h\n"
+ "smlal v27.4s, v11.4h, v22.4h\n"
+ "smlal v1.4s, v19.4h, v22.4h\n"
+ "smlal2 v6.4s, v11.8h, v22.8h\n"
+ "smlal2 v25.4s, v19.8h, v22.8h\n"
+ "smlal v8.4s, v11.4h, v18.4h\n"
+ "smlal2 v0.4s, v11.8h, v18.8h\n"
+ "smlal v5.4s, v12.4h, v22.4h\n"
+ "smlal2 v30.4s, v12.8h, v22.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal v1.4s, v12.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal2 v25.4s, v12.8h, v18.8h\n"
+ "smlal v8.4s, v28.4h, v31.4h\n"
+ "smlal2 v0.4s, v28.8h, v31.8h\n"
+ "smlal v5.4s, v7.4h, v18.4h\n"
+ "smlal2 v30.4s, v7.8h, v18.8h\n"
+ "smlal v27.4s, v16.4h, v31.4h\n"
+ "smlal v1.4s, v7.4h, v31.4h\n"
+ "smlal2 v6.4s, v16.8h, v31.8h\n"
+ "smlal2 v25.4s, v7.8h, v31.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "smlal v5.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "and v17.16b, v8.16b, v24.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v12.16b, v0.16b, v4.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"sshr v12.4s, v12.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
+ "and v21.16b, v27.16b, v24.16b\n"
+ "and v16.16b, v1.16b, v24.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v28.16b, v5.16b, v24.16b\n"
+ "sqadd v0.4s, v0.4s, v12.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v17.16b, v25.16b, v4.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v3.16b, v30.16b, v4.16b\n"
+ "sqadd v27.4s, v27.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v28.4s\n"
- "srshl v20.4s, v20.4s, v28.4s\n"
- "sqadd v5.4s, v5.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqadd v22.4s, v22.4s, v12.4s\n"
- "srshl v23.4s, v23.4s, v28.4s\n"
- "sqadd v19.4s, v19.4s, v17.4s\n"
- "srshl v15.4s, v15.4s, v14.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v14.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v14.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x17, x4]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d20, [x16, x4]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x15, x4]\n"
- "str d23, [x14, x4]\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q15, [x20, #0x10]\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v24.4s\n"
+ "srshl v27.4s, v27.4s, v24.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v24.4s\n"
+ "sqadd v30.4s, v30.4s, v3.4s\n"
+ "srshl v0.4s, v0.4s, v4.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v4.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d8, [x8, x3]\n"
+ "str d27, [x17, x3]\n"
+ "str d1, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "add x3, x3, #0x8\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "add x4, x4, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldr d12, [x6, #0x20]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "usubl v14.8h, v14.8b, v13.8b\n"
- "ldr d31, [x9, x3]\n"
- "ldr d17, [x28, x3]\n"
- "usubl v10.8h, v10.8b, v13.8b\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "ldr d30, [x27, x3]\n"
- "ldr d16, [x26, x3]\n"
- "usubl v12.8h, v12.8b, v13.8b\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "ldr d3, [x25, x3]\n"
- "ldr d4, [x24, x3]\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "ldr d25, [x23, x3]\n"
- "ldr d9, [x22, x3]\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "ldr d29, [x21, x3]\n"
- "ldr d28, [x20, x3]\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "ldr d23, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "usubl v12.8h, v12.8b, v9.8b\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "usubl v11.8h, v11.8b, v9.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d24, [x9, x2]\n"
+ "ldr d21, [x28, x2]\n"
+ "ldr d16, [x27, x2]\n"
+ "ldr d20, [x26, x2]\n"
+ "ldr d7, [x25, x2]\n"
+ "ldr d19, [x24, x2]\n"
+ "ldr d28, [x23, x2]\n"
+ "ldr d26, [x22, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr d29, [x21, x2]\n"
+ "ldr d18, [x20, x2]\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr d27, [x6, #0x28]\n"
- "ldr d1, [x6, #0x30]\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "ldr d2, [x6, #0x38]\n"
- "ldr d31, [x6, #0x40]\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "ldr d8, [x6, #0x48]\n"
- "ldr x22, [x5, #0x50]\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "ldr x20, [x5, #0x58]\n"
- "ldr x21, [x5, #0x60]\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "usubl v27.8h, v27.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "ldr x22, [x5, #0x70]\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "ldr d14, [x20, x3]\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal v23.4s, v17.4h, v10.4h\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldr x21, [x5, #0x78]\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "ldr x20, [x5, #0x80]\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "ldr d25, [x22, x3]\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v10.8h\n"
- "ldr d10, [x21, x3]\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal v24.4s, v17.4h, v21.4h\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "ldr x24, [x5, #0x88]\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "smlal v7.4s, v30.4h, v27.4h\n"
- "ldr x23, [x5, #0x90]\n"
- "ldr x22, [x5, #0x98]\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "ldr d9, [x20, x3]\n"
- "smlal2 v22.4s, v17.8h, v21.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "ldr d21, [x6, #0x50]\n"
- "smlal v20.4s, v3.4h, v12.4h\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "ldr x21, [x5, #0xa0]\n"
- "ldr x20, [x5, #0xa8]\n"
- "smlal2 v15.4s, v30.8h, v27.8h\n"
- "ldr d30, [x24, x3]\n"
- "smlal v7.4s, v16.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v12.8h\n"
- "ldr d3, [x6, #0x58]\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "ldr d12, [x23, x3]\n"
- "smlal v20.4s, v16.4h, v27.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal v24.4s, v28.4h, v27.4h\n"
- "smlal v23.4s, v14.4h, v27.4h\n"
- "ldr x13, [x5, #0xb0]\n"
- "ldr x12, [x5, #0xb8]\n"
- "smlal2 v15.4s, v16.8h, v1.8h\n"
- "smlal v7.4s, v4.4h, v2.4h\n"
- "ldr x11, [x5, #0xc0]\n"
- "ldr x10, [x5, #0xc8]\n"
- "smlal2 v5.4s, v16.8h, v27.8h\n"
- "ldr d16, [x22, x3]\n"
- "smlal2 v22.4s, v28.8h, v27.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v27.8h\n"
- "ldr d27, [x6, #0x60]\n"
- "smlal v20.4s, v4.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v1.4h\n"
- "smlal v23.4s, v25.4h, v1.4h\n"
- "ldr x9, [x5, #0xd0]\n"
- "ldr x28, [x5, #0xd8]\n"
- "smlal2 v15.4s, v4.8h, v2.8h\n"
- "smlal v7.4s, v17.4h, v31.4h\n"
- "ldr x27, [x5, #0xe0]\n"
- "ldr x26, [x5, #0xe8]\n"
- "smlal2 v5.4s, v4.8h, v1.8h\n"
- "ldr d4, [x21, x3]\n"
- "smlal2 v22.4s, v14.8h, v1.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "ldr d1, [x6, #0x68]\n"
- "smlal v20.4s, v17.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v2.4h\n"
- "smlal v23.4s, v10.4h, v2.4h\n"
- "ldr x25, [x5, #0xf0]\n"
- "ldr x24, [x5, #0xf8]\n"
- "smlal2 v15.4s, v17.8h, v31.8h\n"
- "smlal v7.4s, v6.4h, v8.4h\n"
- "ldr x23, [x5, #0x100]\n"
- "ldr x22, [x5, #0x108]\n"
- "smlal2 v5.4s, v17.8h, v2.8h\n"
- "ldr d17, [x20, x3]\n"
- "smlal2 v22.4s, v25.8h, v2.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v2.8h\n"
- "ldr d2, [x6, #0x70]\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v31.4h\n"
- "smlal v23.4s, v9.4h, v31.4h\n"
- "ldr x21, [x5, #0x110]\n"
- "ldr x20, [x5, #0x118]\n"
- "smlal2 v15.4s, v6.8h, v8.8h\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
+ "ldr d4, [x5, #0x28]\n"
+ "ldr d3, [x5, #0x30]\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "ldr d22, [x5, #0x38]\n"
+ "ldr d2, [x5, #0x40]\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ldr d24, [x5, #0x48]\n"
+ "ldr x21, [x4, #0x50]\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x50]\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "ldr d21, [x5, #0x58]\n"
+ "ldr x28, [x4, #0x60]\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x27, [x4, #0x68]\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "ldr d12, [x21, x2]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x20, x2]\n"
+ "ldr x25, [x4, #0x78]\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x24, [x4, #0x80]\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "ldr x23, [x4, #0x88]\n"
+ "ldr x22, [x4, #0x90]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x28, x2]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v17.4h\n"
+ "smlal2 v30.4s, v12.8h, v17.8h\n"
+ "ldr d17, [x27, x2]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal v1.4s, v12.4h, v11.4h\n"
+ "usubl v21.8h, v21.8b, v9.8b\n"
+ "ldr x14, [x4, #0xa8]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal2 v25.4s, v12.8h, v11.8h\n"
+ "ldr x13, [x4, #0xb0]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x26, x2]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v7.4h, v11.4h\n"
+ "smlal2 v30.4s, v7.8h, v11.8h\n"
+ "ldr d11, [x25, x2]\n"
+ "ldr x12, [x4, #0xb8]\n"
+ "smlal v27.4s, v28.4h, v23.4h\n"
+ "smlal v1.4s, v7.4h, v23.4h\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "ldr x10, [x4, #0xc8]\n"
+ "smlal2 v6.4s, v28.8h, v23.8h\n"
+ "ldr d28, [x24, x2]\n"
+ "smlal2 v25.4s, v7.8h, v23.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "ldr d16, [x23, x2]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "ldr d23, [x22, x2]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal v1.4s, v18.4h, v4.4h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal2 v6.4s, v20.8h, v4.8h\n"
+ "smlal2 v25.4s, v18.8h, v4.8h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal v8.4s, v20.4h, v3.4h\n"
+ "smlal2 v0.4s, v20.8h, v3.8h\n"
+ "ldr d20, [x21, x2]\n"
+ "usubl v23.8h, v23.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v4.4h\n"
+ "smlal2 v30.4s, v17.8h, v4.8h\n"
+ "ldr d4, [x5, #0x60]\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal v27.4s, v19.4h, v3.4h\n"
+ "smlal v1.4s, v17.4h, v3.4h\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal2 v6.4s, v19.8h, v3.8h\n"
+ "smlal2 v25.4s, v17.8h, v3.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal v8.4s, v19.4h, v22.4h\n"
+ "smlal2 v0.4s, v19.8h, v22.8h\n"
+ "ldr d19, [x20, x2]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v3.4h\n"
+ "smlal2 v30.4s, v26.8h, v3.8h\n"
+ "ldr d3, [x5, #0x68]\n"
+ "ldr x22, [x4, #0x108]\n"
+ "smlal v27.4s, v12.4h, v22.4h\n"
+ "smlal v1.4s, v26.4h, v22.4h\n"
+ "ldr x21, [x4, #0x110]\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v6.4s, v12.8h, v22.8h\n"
+ "smlal2 v25.4s, v26.8h, v22.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
"tst x1, #0x7\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "ldr d6, [x13, x3]\n"
- "smlal2 v22.4s, v10.8h, v31.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v31.8h\n"
- "ldr d31, [x6, #0x78]\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v9.4h, v8.4h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "ldr d28, [x12, x3]\n"
- "smlal v7.4s, v14.4h, v3.4h\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v5.4s, v29.8h, v8.8h\n"
- "ldr d29, [x6, #0x80]\n"
- "smlal2 v22.4s, v9.8h, v8.8h\n"
- "usubl v29.8h, v29.8b, v13.8b\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "ldr d8, [x11, x3]\n"
- "smlal v20.4s, v14.4h, v21.4h\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "smlal v24.4s, v12.4h, v21.4h\n"
- "smlal v23.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v14.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v27.4h\n"
- "smlal2 v5.4s, v14.8h, v21.8h\n"
- "ldr d14, [x10, x3]\n"
- "smlal2 v22.4s, v12.8h, v21.8h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v21.8h\n"
- "ldr d21, [x6, #0x88]\n"
- "smlal v20.4s, v25.4h, v3.4h\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v16.4h, v3.4h\n"
- "smlal v23.4s, v4.4h, v3.4h\n"
- "smlal2 v15.4s, v25.8h, v27.8h\n"
- "smlal v7.4s, v10.4h, v1.4h\n"
- "smlal2 v5.4s, v25.8h, v3.8h\n"
- "ldr d25, [x9, x3]\n"
- "smlal2 v22.4s, v16.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v3.8h\n"
- "ldr d3, [x6, #0x90]\n"
- "smlal v20.4s, v10.4h, v27.4h\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal v24.4s, v4.4h, v27.4h\n"
- "smlal v23.4s, v17.4h, v27.4h\n"
- "smlal2 v15.4s, v10.8h, v1.8h\n"
- "smlal v7.4s, v9.4h, v2.4h\n"
- "smlal2 v5.4s, v10.8h, v27.8h\n"
- "ldr d10, [x28, x3]\n"
- "smlal2 v22.4s, v4.8h, v27.8h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v27.8h\n"
- "ldr d27, [x6, #0x98]\n"
- "smlal v20.4s, v9.4h, v1.4h\n"
- "usubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v17.4h, v1.4h\n"
- "smlal v23.4s, v6.4h, v1.4h\n"
- "smlal2 v15.4s, v9.8h, v2.8h\n"
- "smlal v7.4s, v12.4h, v31.4h\n"
- "smlal2 v5.4s, v9.8h, v1.8h\n"
- "ldr d9, [x27, x3]\n"
- "smlal2 v22.4s, v17.8h, v1.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v1.8h\n"
- "ldr d1, [x6, #0xa0]\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v2.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v15.4s, v12.8h, v31.8h\n"
- "ldr d12, [x26, x3]\n"
- "smlal v7.4s, v16.4h, v29.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d30, [x6, #0xa8]\n"
- "smlal2 v22.4s, v6.8h, v2.8h\n"
- "usubl v30.8h, v30.8b, v13.8b\n"
- "smlal2 v19.4s, v28.8h, v2.8h\n"
- "ldr d2, [x25, x3]\n"
- "smlal v20.4s, v16.4h, v31.4h\n"
- "usubl v2.8h, v2.8b, v18.8b\n"
- "smlal v24.4s, v8.4h, v31.4h\n"
- "smlal v23.4s, v14.4h, v31.4h\n"
- "smlal2 v15.4s, v16.8h, v29.8h\n"
- "smlal v7.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v16.8h, v31.8h\n"
- "ldr d16, [x24, x3]\n"
- "smlal2 v22.4s, v8.8h, v31.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v31.8h\n"
- "ldr d31, [x6, #0xb0]\n"
- "smlal v20.4s, v4.4h, v29.4h\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v29.4h\n"
- "smlal v23.4s, v25.4h, v29.4h\n"
- "smlal2 v15.4s, v4.8h, v21.8h\n"
- "smlal v7.4s, v17.4h, v3.4h\n"
- "smlal2 v5.4s, v4.8h, v29.8h\n"
- "ldr d4, [x23, x3]\n"
- "smlal2 v22.4s, v14.8h, v29.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v29.8h\n"
- "ldr d29, [x6, #0xb8]\n"
- "smlal v20.4s, v17.4h, v21.4h\n"
- "usubl v29.8h, v29.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v15.4s, v17.8h, v3.8h\n"
- "smlal v7.4s, v6.4h, v27.4h\n"
- "smlal2 v5.4s, v17.8h, v21.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "ldr d21, [x6, #0xc0]\n"
- "smlal v20.4s, v6.4h, v3.4h\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v3.4h\n"
- "smlal v23.4s, v9.4h, v3.4h\n"
- "smlal2 v15.4s, v6.8h, v27.8h\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "smlal2 v5.4s, v6.8h, v3.8h\n"
- "ldr d6, [x21, x3]\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v3.8h\n"
- "ldr d3, [x20, x3]\n"
- "smlal v20.4s, v28.4h, v27.4h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal v24.4s, v9.4h, v27.4h\n"
- "smlal v23.4s, v12.4h, v27.4h\n"
- "add x3, x3, #0x8\n"
- "smlal2 v15.4s, v8.8h, v1.8h\n"
- "ldr q8, [x7, #0x0]\n"
- "smlal v7.4s, v14.4h, v30.4h\n"
- "smlal2 v5.4s, v28.8h, v27.8h\n"
- "ldr q28, [x8, #0x0]\n"
- "smlal2 v22.4s, v9.8h, v27.8h\n"
- "smlal2 v19.4s, v12.8h, v27.8h\n"
- "ldr q27, [x7, #0x10]\n"
- "smlal v20.4s, v14.4h, v1.4h\n"
+ "smlal v8.4s, v12.4h, v2.4h\n"
+ "smlal2 v0.4s, v12.8h, v2.8h\n"
+ "ldr d12, [x14, x2]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v22.4h\n"
+ "smlal2 v30.4s, v11.8h, v22.8h\n"
+ "ldr d22, [x5, #0x70]\n"
+ "smlal v27.4s, v7.4h, v2.4h\n"
+ "smlal v1.4s, v11.4h, v2.4h\n"
+ "smlal2 v6.4s, v7.8h, v2.8h\n"
+ "smlal2 v25.4s, v11.8h, v2.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v24.4h\n"
+ "smlal2 v0.4s, v7.8h, v24.8h\n"
+ "ldr d7, [x13, x2]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v2.4h\n"
+ "smlal2 v30.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x5, #0x78]\n"
+ "smlal v27.4s, v29.4h, v24.4h\n"
+ "smlal v1.4s, v28.4h, v24.4h\n"
+ "smlal2 v6.4s, v29.8h, v24.8h\n"
+ "ldr d29, [x12, x2]\n"
+ "smlal2 v25.4s, v28.8h, v24.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v18.4h, v31.4h\n"
+ "smlal2 v0.4s, v18.8h, v31.8h\n"
+ "ldr d18, [x5, #0x80]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v24.4h\n"
+ "smlal2 v30.4s, v16.8h, v24.8h\n"
+ "ldr d24, [x11, x2]\n"
+ "smlal v27.4s, v17.4h, v31.4h\n"
+ "smlal v1.4s, v23.4h, v31.4h\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v6.4s, v17.8h, v31.8h\n"
+ "smlal2 v25.4s, v23.8h, v31.8h\n"
+ "usubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v8.4s, v17.4h, v21.4h\n"
+ "smlal2 v0.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x10, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x88]\n"
+ "smlal v27.4s, v26.4h, v21.4h\n"
+ "smlal v1.4s, v20.4h, v21.4h\n"
+ "smlal2 v6.4s, v26.8h, v21.8h\n"
+ "smlal2 v25.4s, v20.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v0.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x9, x2]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v21.4h\n"
+ "smlal2 v30.4s, v19.8h, v21.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "smlal v27.4s, v11.4h, v4.4h\n"
+ "smlal v1.4s, v19.4h, v4.4h\n"
+ "smlal2 v6.4s, v11.8h, v4.8h\n"
+ "smlal2 v25.4s, v19.8h, v4.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v0.4s, v11.8h, v3.8h\n"
+ "ldr d11, [x28, x2]\n"
+ "usubl v21.8h, v21.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v4.4h\n"
+ "smlal2 v30.4s, v12.8h, v4.8h\n"
+ "ldr d4, [x5, #0x98]\n"
+ "smlal v27.4s, v28.4h, v3.4h\n"
+ "smlal v1.4s, v12.4h, v3.4h\n"
+ "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "smlal2 v25.4s, v12.8h, v3.8h\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal2 v0.4s, v28.8h, v22.8h\n"
+ "ldr d28, [x27, x2]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v3.4h\n"
+ "smlal2 v30.4s, v7.8h, v3.8h\n"
+ "ldr d3, [x5, #0xa0]\n"
+ "smlal v27.4s, v16.4h, v22.4h\n"
+ "smlal v1.4s, v7.4h, v22.4h\n"
+ "smlal2 v6.4s, v16.8h, v22.8h\n"
+ "ldr d16, [x26, x2]\n"
+ "smlal2 v25.4s, v7.8h, v22.8h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal2 v0.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x5, #0xa8]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v22.4h\n"
+ "smlal2 v30.4s, v29.8h, v22.8h\n"
+ "ldr d22, [x25, x2]\n"
+ "smlal v27.4s, v20.4h, v2.4h\n"
+ "smlal v1.4s, v24.4h, v2.4h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "smlal2 v6.4s, v20.8h, v2.8h\n"
+ "smlal2 v25.4s, v24.8h, v2.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x24, x2]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v2.4h\n"
+ "smlal2 v30.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x5, #0xb0]\n"
+ "smlal v27.4s, v19.4h, v18.4h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v6.4s, v19.8h, v18.8h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "smlal v8.4s, v19.4h, v31.4h\n"
+ "smlal2 v0.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x23, x2]\n"
+ "usubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v30.4s, v26.8h, v18.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "smlal v27.4s, v12.4h, v31.4h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v6.4s, v12.8h, v31.8h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v21.4h\n"
+ "smlal2 v0.4s, v12.8h, v21.8h\n"
+ "ldr d12, [x22, x2]\n"
+ "usubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d31, [x5, #0xc0]\n"
+ "smlal v27.4s, v7.4h, v21.4h\n"
+ "smlal v1.4s, v11.4h, v21.4h\n"
+ "smlal2 v6.4s, v7.8h, v21.8h\n"
+ "smlal2 v25.4s, v11.8h, v21.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v4.4h\n"
+ "smlal2 v0.4s, v7.8h, v4.8h\n"
+ "ldr d7, [x21, x2]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v21.4h\n"
+ "smlal2 v30.4s, v28.8h, v21.8h\n"
+ "ldr d21, [x20, x2]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v27.4s, v29.4h, v4.4h\n"
+ "smlal v1.4s, v28.4h, v4.4h\n"
+ "smlal2 v6.4s, v29.8h, v4.8h\n"
+ "ldr q29, [x6, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v0.4s, v24.8h, v3.8h\n"
+ "ldr q24, [x7, #0x0]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "smlal v5.4s, v16.4h, v4.4h\n"
+ "smlal2 v30.4s, v16.8h, v4.8h\n"
+ "ldr q4, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v27.4s, v17.4h, v3.4h\n"
+ "smlal v1.4s, v22.4h, v3.4h\n"
+ "smlal2 v6.4s, v17.8h, v3.8h\n"
+ "smlal2 v25.4s, v22.8h, v3.8h\n"
+ "ldr q22, [x7, #0x10]\n"
"add x7, x7, #0x20\n"
- "smlal v24.4s, v2.4h, v1.4h\n"
- "smlal v23.4s, v16.4h, v1.4h\n"
- "smlal2 v15.4s, v14.8h, v30.8h\n"
- "smlal v7.4s, v25.4h, v31.4h\n"
- "smlal2 v5.4s, v14.8h, v1.8h\n"
- "ldr q14, [x8, #0x10]\n"
- "smlal2 v22.4s, v2.8h, v1.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v19.4s, v16.8h, v1.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal v24.4s, v16.4h, v30.4h\n"
- "smlal v23.4s, v4.4h, v30.4h\n"
- "smlal2 v15.4s, v25.8h, v31.8h\n"
- "smlal v7.4s, v10.4h, v29.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal2 v22.4s, v16.8h, v30.8h\n"
- "smlal2 v19.4s, v4.8h, v30.8h\n"
- "smlal v20.4s, v10.4h, v31.4h\n"
- "smlal v24.4s, v4.4h, v31.4h\n"
- "smlal v23.4s, v17.4h, v31.4h\n"
- "smlal2 v15.4s, v10.8h, v29.8h\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "sqrdmulh v7.4s, v7.4s, v8.4s\n"
- "smlal2 v5.4s, v10.8h, v31.8h\n"
- "smlal2 v22.4s, v4.8h, v31.8h\n"
- "and v4.16b, v7.16b, v28.16b\n"
- "smlal2 v19.4s, v17.8h, v31.8h\n"
- "smlal v20.4s, v9.4h, v29.4h\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "smlal v24.4s, v17.4h, v29.4h\n"
- "smlal v23.4s, v6.4h, v29.4h\n"
- "sqadd v7.4s, v7.4s, v4.4s\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal2 v5.4s, v9.8h, v29.8h\n"
- "sqrdmulh v15.4s, v15.4s, v27.4s\n"
- "smlal2 v22.4s, v17.8h, v29.8h\n"
- "smlal2 v19.4s, v6.8h, v29.8h\n"
- "and v30.16b, v15.16b, v14.16b\n"
- "smlal v20.4s, v12.4h, v21.4h\n"
- "smlal v24.4s, v6.4h, v21.4h\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smlal v23.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v12.8h, v21.8h\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "smlal2 v22.4s, v6.8h, v21.8h\n"
- "smlal2 v19.4s, v3.8h, v21.8h\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v3.16b, v20.16b, v28.16b\n"
- "sqrdmulh v5.4s, v5.4s, v27.4s\n"
- "and v25.16b, v24.16b, v28.16b\n"
- "sqrdmulh v22.4s, v22.4s, v27.4s\n"
- "and v16.16b, v23.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v27.4s\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v4.16b, v5.16b, v14.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "and v10.16b, v22.16b, v14.16b\n"
+ "smlal v8.4s, v17.4h, v23.4h\n"
+ "smlal2 v0.4s, v17.8h, v23.8h\n"
+ "smlal v5.4s, v20.4h, v3.4h\n"
+ "smlal2 v30.4s, v20.8h, v3.8h\n"
+ "smlal v27.4s, v26.4h, v23.4h\n"
+ "smlal v1.4s, v20.4h, v23.4h\n"
+ "smlal2 v6.4s, v26.8h, v23.8h\n"
+ "smlal2 v25.4s, v20.8h, v23.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal2 v0.4s, v26.8h, v2.8h\n"
+ "smlal v5.4s, v19.4h, v23.4h\n"
+ "smlal2 v30.4s, v19.8h, v23.8h\n"
+ "smlal v27.4s, v11.4h, v2.4h\n"
+ "smlal v1.4s, v19.4h, v2.4h\n"
+ "smlal2 v6.4s, v11.8h, v2.8h\n"
+ "smlal2 v25.4s, v19.8h, v2.8h\n"
+ "smlal v8.4s, v11.4h, v18.4h\n"
+ "smlal2 v0.4s, v11.8h, v18.8h\n"
+ "smlal v5.4s, v12.4h, v2.4h\n"
+ "smlal2 v30.4s, v12.8h, v2.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal v1.4s, v12.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal2 v25.4s, v12.8h, v18.8h\n"
+ "smlal v8.4s, v28.4h, v31.4h\n"
+ "smlal2 v0.4s, v28.8h, v31.8h\n"
+ "smlal v5.4s, v7.4h, v18.4h\n"
+ "smlal2 v30.4s, v7.8h, v18.8h\n"
+ "smlal v27.4s, v16.4h, v31.4h\n"
+ "smlal v1.4s, v7.4h, v31.4h\n"
+ "smlal2 v6.4s, v16.8h, v31.8h\n"
+ "smlal2 v25.4s, v7.8h, v31.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v4.4s\n"
+ "smlal v5.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "and v17.16b, v8.16b, v24.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v28.16b, v0.16b, v22.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v16.16b, v27.16b, v24.16b\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v11.16b, v5.16b, v24.16b\n"
+ "sqadd v0.4s, v0.4s, v28.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v12.16b, v19.16b, v14.16b\n"
- "sqadd v20.4s, v20.4s, v3.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v25.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
+ "and v18.16b, v6.16b, v22.16b\n"
"sshr v12.4s, v12.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v28.4s\n"
- "srshl v20.4s, v20.4s, v28.4s\n"
- "sqadd v5.4s, v5.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v28.4s\n"
- "sqadd v19.4s, v19.4s, v12.4s\n"
- "srshl v15.4s, v15.4s, v14.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v14.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v14.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x17, x4]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d20, [x16, x4]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x15, x4]\n"
- "str d23, [x14, x4]\n"
- "add x4, x4, #0x8\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v19.16b, v30.16b, v22.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v5.4s, v5.4s, v11.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v24.4s\n"
+ "srshl v27.4s, v27.4s, v24.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v24.4s\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d8, [x8, x3]\n"
+ "str d27, [x17, x3]\n"
+ "str d1, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "add x3, x3, #0x8\n"
"beq 124f\n"
- "add x6, x6, #0xc8\n"
+ "add x5, x5, #0xc8\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v7.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v0.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v15.s }[2], [x20]\n"
+ "ld1 { v0.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v15.s }[0], [x20]\n"
+ "ld1 { v0.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v7.s }[2], [x20]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v7.s }[0], [x20]\n"
+ "ld1 { v8.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldr d12, [x6, #0x20]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "usubl v6.8h, v6.8b, v13.8b\n"
- "usubl v14.8h, v14.8b, v13.8b\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "usubl v10.8h, v10.8b, v13.8b\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "usubl v12.8h, v12.8b, v13.8b\n"
- "add x9, x9, x3\n"
- "add x28, x28, x3\n"
- "add x27, x27, x3\n"
- "add x26, x26, x3\n"
- "add x25, x25, x3\n"
- "add x24, x24, x3\n"
- "add x23, x23, x3\n"
- "add x22, x22, x3\n"
- "add x21, x21, x3\n"
- "add x20, x20, x3\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "ldr d23, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "usubl v12.8h, v12.8b, v9.8b\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "usubl v11.8h, v11.8b, v9.8b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "add x9, x9, x2\n"
+ "add x28, x28, x2\n"
+ "add x27, x27, x2\n"
+ "add x26, x26, x2\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "add x25, x25, x2\n"
+ "add x24, x24, x2\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "add x23, x23, x2\n"
+ "add x22, x22, x2\n"
+ "add x21, x21, x2\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 9f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v17.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "ld1 { v16.s }[0], [x26], #0x4\n"
- "ld1 { v3.s }[0], [x25], #0x4\n"
- "ld1 { v4.s }[0], [x24], #0x4\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x9], #0x4\n"
+ "ld1 { v21.s }[0], [x28], #0x4\n"
+ "ld1 { v16.s }[0], [x27], #0x4\n"
+ "ld1 { v20.s }[0], [x26], #0x4\n"
+ "ld1 { v7.s }[0], [x25], #0x4\n"
+ "ld1 { v19.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
"ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v17.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v16.h }[2], [x26], #0x2\n"
- "ld1 { v3.h }[2], [x25], #0x2\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v21.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v20.h }[2], [x26], #0x2\n"
+ "ld1 { v7.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
"ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[6], [x9]\n"
- "ld1 { v17.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
- "ld1 { v16.b }[6], [x26]\n"
- "ld1 { v3.b }[6], [x25]\n"
- "ld1 { v4.b }[6], [x24]\n"
- "ld1 { v25.b }[6], [x23]\n"
- "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x9]\n"
+ "ld1 { v21.b }[6], [x28]\n"
+ "ld1 { v16.b }[6], [x27]\n"
+ "ld1 { v20.b }[6], [x26]\n"
+ "ld1 { v7.b }[6], [x25]\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
"ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[4], [x9]\n"
- "ld1 { v17.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v16.b }[4], [x26]\n"
- "ld1 { v3.b }[4], [x25]\n"
- "ld1 { v4.b }[4], [x24]\n"
- "ld1 { v25.b }[4], [x23]\n"
- "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x9]\n"
+ "ld1 { v21.b }[4], [x28]\n"
+ "ld1 { v16.b }[4], [x27]\n"
+ "ld1 { v20.b }[4], [x26]\n"
+ "ld1 { v7.b }[4], [x25]\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
"ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v17.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "ld1 { v16.h }[0], [x26], #0x2\n"
- "ld1 { v3.h }[0], [x25], #0x2\n"
- "ld1 { v4.h }[0], [x24], #0x2\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x9], #0x2\n"
+ "ld1 { v21.h }[0], [x28], #0x2\n"
+ "ld1 { v16.h }[0], [x27], #0x2\n"
+ "ld1 { v20.h }[0], [x26], #0x2\n"
+ "ld1 { v7.h }[0], [x25], #0x2\n"
+ "ld1 { v19.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
"ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[2], [x9]\n"
- "ld1 { v17.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
- "ld1 { v16.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x24]\n"
- "ld1 { v25.b }[2], [x23]\n"
- "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x9]\n"
+ "ld1 { v21.b }[2], [x28]\n"
+ "ld1 { v16.b }[2], [x27]\n"
+ "ld1 { v20.b }[2], [x26]\n"
+ "ld1 { v7.b }[2], [x25]\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
"ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[0], [x9]\n"
- "ld1 { v17.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
- "ld1 { v16.b }[0], [x26]\n"
- "ld1 { v3.b }[0], [x25]\n"
- "ld1 { v4.b }[0], [x24]\n"
- "ld1 { v25.b }[0], [x23]\n"
- "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x9]\n"
+ "ld1 { v21.b }[0], [x28]\n"
+ "ld1 { v16.b }[0], [x27]\n"
+ "ld1 { v20.b }[0], [x26]\n"
+ "ld1 { v7.b }[0], [x25]\n"
+ "ld1 { v19.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
"ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "ldr x20, [x5, #0x50]\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "add x20, x20, x3\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0x50]\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "add x20, x20, x2\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v4.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v4.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v4.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v4.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v18.8b\n"
- "ldr x20, [x5, #0x58]\n"
- "smlal v23.4s, v27.4h, v10.4h\n"
- "smlal2 v19.4s, v27.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "smlal v24.4s, v27.4h, v21.4h\n"
- "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal v5.4s, v4.4h, v17.4h\n"
+ "smlal2 v30.4s, v4.8h, v17.8h\n"
+ "smlal v1.4s, v4.4h, v11.4h\n"
+ "smlal2 v25.4s, v4.8h, v11.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 17f\n"
- "ld1 { v6.s }[0], [x20], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v6.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v6.h }[0], [x20], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v6.8h, v6.8b, v18.8b\n"
- "ldr x20, [x5, #0x60]\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "add x20, x20, x3\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0x60]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "smlal v5.4s, v21.4h, v11.4h\n"
+ "smlal2 v30.4s, v21.8h, v11.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 21f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d14, [x6, #0x28]\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal v20.4s, v9.4h, v12.4h\n"
- "smlal2 v5.4s, v9.8h, v12.8h\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "usubl v14.8h, v14.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v30.4h, v14.4h\n"
- "smlal2 v15.4s, v30.8h, v14.8h\n"
- "smlal v20.4s, v16.4h, v14.4h\n"
- "smlal2 v5.4s, v16.8h, v14.8h\n"
- "smlal v24.4s, v28.4h, v14.4h\n"
- "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "ldr d11, [x5, #0x28]\n"
+ "usubl v31.8h, v31.8b, v15.8b\n"
+ "smlal v1.4s, v21.4h, v23.4h\n"
+ "smlal2 v25.4s, v21.8h, v23.8h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "smlal v27.4s, v31.4h, v23.4h\n"
+ "smlal2 v6.4s, v31.8h, v23.8h\n"
+ "usubl v11.8h, v11.8b, v9.8b\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "smlal v1.4s, v18.4h, v11.4h\n"
+ "smlal2 v25.4s, v18.8h, v11.8h\n"
+ "smlal v27.4s, v20.4h, v11.4h\n"
+ "smlal2 v6.4s, v20.8h, v11.8h\n"
"tbz x1, #2, 25f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x1, #1, 26f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d21, [x6, #0x30]\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0x70]\n"
- "smlal v23.4s, v25.4h, v14.4h\n"
- "smlal2 v19.4s, v25.8h, v14.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v16.8h, v21.8h\n"
- "smlal v20.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v4.8h, v21.8h\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ldr d3, [x5, #0x30]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "ldr x20, [x4, #0x70]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v24.4h, v11.4h\n"
+ "smlal2 v30.4s, v24.8h, v11.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v3.4h\n"
+ "smlal2 v0.4s, v20.8h, v3.8h\n"
+ "smlal v27.4s, v19.4h, v3.4h\n"
+ "smlal2 v6.4s, v19.8h, v3.8h\n"
+ "smlal v1.4s, v24.4h, v3.4h\n"
+ "smlal2 v25.4s, v24.8h, v3.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d9, [x6, #0x38]\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "usubl v9.8h, v9.8b, v13.8b\n"
- "ldr x20, [x5, #0x78]\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v4.4h, v9.4h\n"
- "smlal2 v15.4s, v4.8h, v9.8h\n"
- "smlal v20.4s, v27.4h, v9.4h\n"
- "smlal2 v5.4s, v27.8h, v9.8h\n"
- "smlal v24.4s, v10.4h, v9.4h\n"
- "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "ldr d22, [x5, #0x38]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x20, [x4, #0x78]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v2.4h, v3.4h\n"
+ "smlal2 v30.4s, v2.8h, v3.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v19.4h, v22.4h\n"
+ "smlal2 v0.4s, v19.8h, v22.8h\n"
+ "smlal v27.4s, v4.4h, v22.4h\n"
+ "smlal2 v6.4s, v4.8h, v22.8h\n"
+ "smlal v1.4s, v2.4h, v22.4h\n"
+ "smlal2 v25.4s, v2.8h, v22.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v12.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v12.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v12.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d31, [x6, #0x40]\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "usubl v31.8h, v31.8b, v13.8b\n"
- "ldr x20, [x5, #0x80]\n"
- "smlal v23.4s, v12.4h, v9.4h\n"
- "smlal2 v19.4s, v12.8h, v9.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v27.4h, v31.4h\n"
- "smlal2 v15.4s, v27.8h, v31.8h\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "smlal v24.4s, v12.4h, v31.4h\n"
- "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "ldr d31, [x5, #0x40]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr x20, [x4, #0x80]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v22.4h\n"
+ "smlal2 v30.4s, v26.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v4.4h, v31.4h\n"
+ "smlal2 v0.4s, v4.8h, v31.8h\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d16, [x6, #0x48]\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "usubl v16.8h, v16.8b, v13.8b\n"
- "ldr x20, [x5, #0x88]\n"
- "smlal v23.4s, v8.4h, v31.4h\n"
- "smlal2 v19.4s, v8.8h, v31.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v6.4h, v16.4h\n"
- "smlal2 v15.4s, v6.8h, v16.8h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal2 v5.4s, v29.8h, v16.8h\n"
- "smlal v24.4s, v8.4h, v16.4h\n"
- "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "ldr d17, [x5, #0x48]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x20, [x4, #0x88]\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v31.4h\n"
+ "smlal2 v30.4s, v28.8h, v31.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v21.4h, v17.4h\n"
+ "smlal2 v0.4s, v21.8h, v17.8h\n"
+ "smlal v27.4s, v29.4h, v17.4h\n"
+ "smlal2 v6.4s, v29.8h, v17.8h\n"
+ "smlal v1.4s, v28.4h, v17.4h\n"
+ "smlal2 v25.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v7.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v7.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v7.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v7.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v7.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v7.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d21, [x6, #0x50]\n"
- "usubl v27.8h, v27.8b, v18.8b\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0x90]\n"
- "smlal v23.4s, v27.4h, v16.4h\n"
- "smlal2 v19.4s, v27.8h, v16.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "smlal v20.4s, v25.4h, v21.4h\n"
- "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "ldr d22, [x5, #0x50]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr x20, [x4, #0x90]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v17.4h\n"
+ "smlal2 v30.4s, v7.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v18.4h, v22.4h\n"
+ "smlal2 v0.4s, v18.8h, v22.8h\n"
+ "smlal v27.4s, v24.4h, v22.4h\n"
+ "smlal2 v6.4s, v24.8h, v22.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v18.8b\n"
- "ldr x20, [x5, #0x98]\n"
- "smlal v24.4s, v31.4h, v21.4h\n"
- "smlal2 v22.4s, v31.8h, v21.8h\n"
- "add x20, x20, x3\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x20, [x4, #0x98]\n"
+ "smlal v1.4s, v20.4h, v22.4h\n"
+ "smlal2 v25.4s, v20.8h, v22.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 49f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d2, [x6, #0x58]\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldr x20, [x5, #0xa0]\n"
- "smlal v23.4s, v28.4h, v21.4h\n"
- "smlal2 v19.4s, v28.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v15.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v10.4h, v2.4h\n"
- "smlal2 v5.4s, v10.8h, v2.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ldr d17, [x5, #0x58]\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v22.4h\n"
+ "smlal2 v30.4s, v19.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v24.4h, v17.4h\n"
+ "smlal2 v0.4s, v24.8h, v17.8h\n"
+ "smlal v27.4s, v2.4h, v17.4h\n"
+ "smlal2 v6.4s, v2.8h, v17.8h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d25, [x6, #0x60]\n"
- "usubl v21.8h, v21.8b, v18.8b\n"
- "usubl v25.8h, v25.8b, v13.8b\n"
- "ldr x20, [x5, #0xa8]\n"
- "smlal v23.4s, v21.4h, v2.4h\n"
- "smlal2 v19.4s, v21.8h, v2.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v10.4h, v25.4h\n"
- "smlal2 v15.4s, v10.8h, v25.8h\n"
- "smlal v20.4s, v12.4h, v25.4h\n"
- "smlal2 v5.4s, v12.8h, v25.8h\n"
- "smlal v24.4s, v21.4h, v25.4h\n"
- "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "ldr d24, [x5, #0x60]\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "ldr x20, [x4, #0xa8]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v17.4h\n"
+ "smlal2 v30.4s, v29.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v2.4h, v24.4h\n"
+ "smlal2 v0.4s, v2.8h, v24.8h\n"
+ "smlal v27.4s, v26.4h, v24.4h\n"
+ "smlal2 v6.4s, v26.8h, v24.8h\n"
+ "smlal v1.4s, v29.4h, v24.4h\n"
+ "smlal2 v25.4s, v29.8h, v24.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d1, [x6, #0x68]\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "ldr x20, [x5, #0xb0]\n"
- "smlal v23.4s, v9.4h, v25.4h\n"
- "smlal2 v19.4s, v9.8h, v25.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v12.4h, v1.4h\n"
- "smlal2 v15.4s, v12.8h, v1.8h\n"
- "smlal v20.4s, v8.4h, v1.4h\n"
- "smlal2 v5.4s, v8.8h, v1.8h\n"
- "smlal v24.4s, v9.4h, v1.4h\n"
- "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "ldr d17, [x5, #0x68]\n"
+ "usubl v31.8h, v31.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v31.4h, v24.4h\n"
+ "smlal2 v30.4s, v31.8h, v24.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v26.4h, v17.4h\n"
+ "smlal2 v0.4s, v26.8h, v17.8h\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "smlal v1.4s, v31.4h, v17.4h\n"
+ "smlal2 v25.4s, v31.8h, v17.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v3.s }[0], [x20], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v3.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v3.h }[0], [x20], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d16, [x6, #0x70]\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "usubl v16.8h, v16.8b, v13.8b\n"
- "ldr x20, [x5, #0xb8]\n"
- "smlal v23.4s, v3.4h, v1.4h\n"
- "smlal2 v19.4s, v3.8h, v1.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "smlal2 v15.4s, v8.8h, v16.8h\n"
- "smlal v20.4s, v27.4h, v16.4h\n"
- "smlal2 v5.4s, v27.8h, v16.8h\n"
- "smlal v24.4s, v3.4h, v16.4h\n"
- "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "ldr d22, [x5, #0x70]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v21.4h, v17.4h\n"
+ "smlal2 v30.4s, v21.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal2 v0.4s, v28.8h, v22.8h\n"
+ "smlal v27.4s, v7.4h, v22.4h\n"
+ "smlal2 v6.4s, v7.8h, v22.8h\n"
+ "smlal v1.4s, v21.4h, v22.4h\n"
+ "smlal2 v25.4s, v21.8h, v22.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v14.s }[0], [x20], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v14.h }[2], [x20], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[6], [x20]\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[4], [x20]\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v14.h }[0], [x20], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[2], [x20]\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[0], [x20]\n"
+ "ld1 { v11.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d17, [x6, #0x78]\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v13.8b\n"
- "ldr x20, [x5, #0xc0]\n"
- "smlal v23.4s, v14.4h, v16.4h\n"
- "smlal2 v19.4s, v14.8h, v16.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v31.4h, v17.4h\n"
- "smlal2 v15.4s, v31.8h, v17.8h\n"
- "smlal v20.4s, v28.4h, v17.4h\n"
- "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "ldr d17, [x5, #0x78]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr x20, [x4, #0xc0]\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v22.4h\n"
+ "smlal2 v30.4s, v11.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v17.4h\n"
+ "smlal2 v0.4s, v20.8h, v17.8h\n"
+ "smlal v27.4s, v19.4h, v17.4h\n"
+ "smlal2 v6.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v1.s }[0], [x20], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v1.h }[2], [x20], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[6], [x20]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[4], [x20]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v1.h }[0], [x20], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v1.8h, v1.8b, v18.8b\n"
- "ldr x20, [x5, #0xc8]\n"
- "smlal v24.4s, v1.4h, v17.4h\n"
- "smlal2 v22.4s, v1.8h, v17.8h\n"
- "add x20, x20, x3\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr x20, [x4, #0xc8]\n"
+ "smlal v1.4s, v18.4h, v17.4h\n"
+ "smlal2 v25.4s, v18.8h, v17.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 73f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d29, [x6, #0x80]\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "usubl v29.8h, v29.8b, v13.8b\n"
- "ldr x20, [x5, #0xd0]\n"
- "smlal v23.4s, v16.4h, v17.4h\n"
- "smlal2 v19.4s, v16.8h, v17.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v28.4h, v29.4h\n"
- "smlal2 v15.4s, v28.8h, v29.8h\n"
- "smlal v20.4s, v21.4h, v29.4h\n"
- "smlal2 v5.4s, v21.8h, v29.8h\n"
- "smlal v24.4s, v16.4h, v29.4h\n"
- "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "ldr d4, [x5, #0x80]\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x20, [x4, #0xd0]\n"
+ "usubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v20.4h, v17.4h\n"
+ "smlal2 v30.4s, v20.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v19.4h, v4.4h\n"
+ "smlal2 v0.4s, v19.8h, v4.8h\n"
+ "smlal v27.4s, v29.4h, v4.4h\n"
+ "smlal2 v6.4s, v29.8h, v4.8h\n"
+ "smlal v1.4s, v20.4h, v4.4h\n"
+ "smlal2 v25.4s, v20.8h, v4.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d12, [x6, #0x88]\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "usubl v12.8h, v12.8b, v13.8b\n"
- "ldr x20, [x5, #0xd8]\n"
- "smlal v23.4s, v30.4h, v29.4h\n"
- "smlal2 v19.4s, v30.8h, v29.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v21.4h, v12.4h\n"
- "smlal2 v15.4s, v21.8h, v12.8h\n"
- "smlal v20.4s, v9.4h, v12.4h\n"
- "smlal2 v5.4s, v9.8h, v12.8h\n"
- "smlal v24.4s, v30.4h, v12.4h\n"
- "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "ldr d17, [x5, #0x88]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr x20, [x4, #0xd8]\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v4.4h\n"
+ "smlal2 v30.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v29.4h, v17.4h\n"
+ "smlal2 v0.4s, v29.8h, v17.8h\n"
+ "smlal v27.4s, v31.4h, v17.4h\n"
+ "smlal2 v6.4s, v31.8h, v17.8h\n"
+ "smlal v1.4s, v26.4h, v17.4h\n"
+ "smlal2 v25.4s, v26.8h, v17.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v23.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d21, [x6, #0x90]\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0xe0]\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal v20.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v3.8h, v21.8h\n"
- "smlal v24.4s, v29.4h, v21.4h\n"
- "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "ldr d22, [x5, #0x90]\n"
+ "usubl v23.8h, v23.8b, v15.8b\n"
+ "ldr x20, [x4, #0xe0]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v31.4h, v22.4h\n"
+ "smlal2 v0.4s, v31.8h, v22.8h\n"
+ "smlal v27.4s, v21.4h, v22.4h\n"
+ "smlal2 v6.4s, v21.8h, v22.8h\n"
+ "smlal v1.4s, v23.4h, v22.4h\n"
+ "smlal2 v25.4s, v23.8h, v22.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d8, [x6, #0x98]\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "ldr x20, [x5, #0xe8]\n"
- "smlal v23.4s, v25.4h, v21.4h\n"
- "smlal2 v19.4s, v25.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v3.4h, v8.4h\n"
- "smlal2 v15.4s, v3.8h, v8.8h\n"
- "smlal v20.4s, v14.4h, v8.4h\n"
- "smlal2 v5.4s, v14.8h, v8.8h\n"
- "smlal v24.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "ldr d17, [x5, #0x98]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x20, [x4, #0xe8]\n"
+ "usubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v22.4h\n"
+ "smlal2 v30.4s, v28.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v21.4h, v17.4h\n"
+ "smlal2 v0.4s, v21.8h, v17.8h\n"
+ "smlal v27.4s, v11.4h, v17.4h\n"
+ "smlal2 v6.4s, v11.8h, v17.8h\n"
+ "smlal v1.4s, v28.4h, v17.4h\n"
+ "smlal2 v25.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d9, [x6, #0xa0]\n"
- "usubl v21.8h, v21.8b, v18.8b\n"
- "usubl v9.8h, v9.8b, v13.8b\n"
- "ldr x20, [x5, #0xf0]\n"
- "smlal v23.4s, v21.4h, v8.4h\n"
- "smlal2 v19.4s, v21.8h, v8.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v1.4h, v9.4h\n"
- "smlal2 v15.4s, v1.8h, v9.8h\n"
- "smlal v20.4s, v16.4h, v9.4h\n"
- "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "ldr d3, [x5, #0xa0]\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x20, [x4, #0xf0]\n"
+ "usubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v17.4h\n"
+ "smlal2 v30.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v18.4h, v3.4h\n"
+ "smlal2 v0.4s, v18.8h, v3.8h\n"
+ "smlal v27.4s, v20.4h, v3.4h\n"
+ "smlal2 v6.4s, v20.8h, v3.8h\n"
"tbz x1, #2, 93f\n"
"ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
@@ -1871,308 +1871,308 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 95f\n"
"ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "usubl v12.8h, v12.8b, v18.8b\n"
- "ldr x20, [x5, #0xf8]\n"
- "smlal v24.4s, v12.4h, v9.4h\n"
- "smlal2 v22.4s, v12.8h, v9.8h\n"
- "add x20, x20, x3\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "ldr x20, [x4, #0xf8]\n"
+ "smlal v1.4s, v12.4h, v3.4h\n"
+ "smlal2 v25.4s, v12.8h, v3.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 97f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d12, [x6, #0xa8]\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "usubl v12.8h, v12.8b, v13.8b\n"
- "ldr x20, [x5, #0x100]\n"
- "smlal v23.4s, v10.4h, v9.4h\n"
- "smlal2 v19.4s, v10.8h, v9.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v16.4h, v12.4h\n"
- "smlal2 v15.4s, v16.8h, v12.8h\n"
- "smlal v20.4s, v30.4h, v12.4h\n"
- "smlal2 v5.4s, v30.8h, v12.8h\n"
- "smlal v24.4s, v10.4h, v12.4h\n"
- "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "ldr d18, [x5, #0xa8]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "ldr x20, [x4, #0x100]\n"
+ "usubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "smlal2 v30.4s, v17.8h, v3.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "smlal v27.4s, v26.4h, v18.4h\n"
+ "smlal2 v6.4s, v26.8h, v18.8h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d28, [x6, #0xb0]\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "usubl v28.8h, v28.8b, v13.8b\n"
- "ldr x20, [x5, #0x108]\n"
- "smlal v23.4s, v9.4h, v12.4h\n"
- "smlal2 v19.4s, v9.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v30.4h, v28.4h\n"
- "smlal2 v15.4s, v30.8h, v28.8h\n"
- "smlal v20.4s, v29.4h, v28.4h\n"
- "smlal2 v5.4s, v29.8h, v28.8h\n"
- "smlal v24.4s, v9.4h, v28.4h\n"
- "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "ldr d12, [x5, #0xb0]\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "ldr x20, [x4, #0x108]\n"
+ "usubl v12.8h, v12.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v18.4h\n"
+ "smlal2 v30.4s, v19.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v26.4h, v12.4h\n"
+ "smlal2 v0.4s, v26.8h, v12.8h\n"
+ "smlal v27.4s, v23.4h, v12.4h\n"
+ "smlal2 v6.4s, v23.8h, v12.8h\n"
+ "smlal v1.4s, v19.4h, v12.4h\n"
+ "smlal2 v25.4s, v19.8h, v12.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v2.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v2.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v2.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d30, [x6, #0xb8]\n"
- "usubl v2.8h, v2.8b, v18.8b\n"
- "usubl v30.8h, v30.8b, v13.8b\n"
- "ldr x20, [x5, #0x110]\n"
- "smlal v23.4s, v2.4h, v28.4h\n"
- "smlal2 v19.4s, v2.8h, v28.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v29.4h, v30.4h\n"
- "smlal2 v15.4s, v29.8h, v30.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal v24.4s, v2.4h, v30.4h\n"
- "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "ldr x20, [x4, #0x110]\n"
+ "usubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v17.4h, v12.4h\n"
+ "smlal2 v30.4s, v17.8h, v12.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v23.4h, v18.4h\n"
+ "smlal2 v0.4s, v23.8h, v18.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d8, [x6, #0xc0]\n"
- "usubl v27.8h, v27.8b, v18.8b\n"
- "usubl v8.8h, v8.8b, v13.8b\n"
- "ldr x20, [x5, #0x118]\n"
- "smlal v23.4s, v27.4h, v30.4h\n"
- "smlal2 v19.4s, v27.8h, v30.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v25.4h, v8.4h\n"
- "smlal2 v15.4s, v25.8h, v8.8h\n"
- "smlal v20.4s, v21.4h, v8.4h\n"
- "smlal2 v5.4s, v21.8h, v8.8h\n"
- "smlal v24.4s, v27.4h, v8.4h\n"
- "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d26, [x5, #0xc0]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v5.4s, v3.4h, v18.4h\n"
+ "smlal2 v30.4s, v3.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v28.4h, v26.4h\n"
+ "smlal2 v0.4s, v28.8h, v26.8h\n"
+ "smlal v27.4s, v16.4h, v26.4h\n"
+ "smlal2 v6.4s, v16.8h, v26.8h\n"
+ "smlal v1.4s, v3.4h, v26.4h\n"
+ "smlal2 v25.4s, v3.8h, v26.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal v23.4s, v9.4h, v8.4h\n"
- "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v26.4h\n"
+ "smlal2 v30.4s, v17.8h, v26.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v30.4s }, [x7], #0x10\n"
- "ld1 { v12.4s }, [x8], #0x10\n"
+ "ld1 { v9.4s }, [x6], #0x10\n"
+ "ld1 { v20.4s }, [x7], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v14.d }[0], [x7], #0x8\n"
- "ld1 { v27.d }[0], [x8], #0x8\n"
+ "ld1 { v18.d }[0], [x6], #0x8\n"
+ "ld1 { v3.d }[0], [x7], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v14.s }[2], [x7]\n"
- "ld1 { v27.s }[2], [x8]\n"
+ "ld1 { v18.s }[2], [x6]\n"
+ "ld1 { v3.s }[2], [x7]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v14.s }[0], [x7]\n"
- "ld1 { v27.s }[0], [x8]\n"
+ "ld1 { v18.s }[0], [x6]\n"
+ "ld1 { v3.s }[0], [x7]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v30.d }[0], [x7], #0x8\n"
- "ld1 { v12.d }[0], [x8], #0x8\n"
+ "ld1 { v9.d }[0], [x6], #0x8\n"
+ "ld1 { v20.d }[0], [x7], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v30.s }[2], [x7]\n"
- "ld1 { v12.s }[2], [x8]\n"
+ "ld1 { v9.s }[2], [x6]\n"
+ "ld1 { v20.s }[2], [x7]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v30.s }[0], [x7]\n"
- "ld1 { v12.s }[0], [x8]\n"
+ "ld1 { v9.s }[0], [x6]\n"
+ "ld1 { v20.s }[0], [x7]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v7.4s, v7.4s, v30.4s\n"
- "and v16.16b, v7.16b, v12.16b\n"
- "add x17, x17, x4\n"
- "add x16, x16, x4\n"
- "sqrdmulh v15.4s, v15.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add x15, x15, x4\n"
- "add x14, x14, x4\n"
- "and v2.16b, v15.16b, v27.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "sqrdmulh v24.4s, v24.4s, v30.4s\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "sqadd v7.4s, v7.4s, v16.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v21.16b, v20.16b, v12.16b\n"
- "sqrdmulh v5.4s, v5.4s, v14.4s\n"
- "and v18.16b, v24.16b, v12.16b\n"
- "sqrdmulh v22.4s, v22.4s, v14.4s\n"
- "and v31.16b, v23.16b, v12.16b\n"
- "sqrdmulh v19.4s, v19.4s, v14.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v9.16b, v5.16b, v27.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v22.16b, v27.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v28.16b, v19.16b, v27.16b\n"
- "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v9.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "add x8, x8, x3\n"
+ "add x17, x17, x3\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v9.4s\n"
+ "add x16, x16, x3\n"
+ "add x15, x15, x3\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "and v17.16b, v8.16b, v20.16b\n"
+ "and v23.16b, v0.16b, v3.16b\n"
+ "and v9.16b, v27.16b, v20.16b\n"
+ "and v26.16b, v1.16b, v20.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v18.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v31.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v12.4s\n"
- "srshl v20.4s, v20.4s, v12.4s\n"
- "sqadd v5.4s, v5.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v12.4s\n"
- "sqadd v22.4s, v22.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "srshl v15.4s, v15.4s, v27.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v27.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v27.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v27.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "and v24.16b, v6.16b, v3.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v18.16b, v25.16b, v3.16b\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v17.16b, v5.16b, v20.16b\n"
+ "sqadd v0.4s, v0.4s, v23.4s\n"
+ "and v16.16b, v30.16b, v3.16b\n"
+ "sqadd v27.4s, v27.4s, v9.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v26.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v24.4s\n"
+ "srshl v1.4s, v1.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "srshl v5.4s, v5.4s, v20.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v3.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v7.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x16], #0x4\n"
- "st1 { v24.s }[0], [x15], #0x4\n"
- "st1 { v23.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x8], #0x4\n"
+ "st1 { v27.s }[0], [x17], #0x4\n"
+ "st1 { v1.s }[0], [x16], #0x4\n"
+ "st1 { v5.s }[0], [x15], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v7.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x16], #0x2\n"
- "st1 { v24.h }[2], [x15], #0x2\n"
- "st1 { v23.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x8], #0x2\n"
+ "st1 { v27.h }[2], [x17], #0x2\n"
+ "st1 { v1.h }[2], [x16], #0x2\n"
+ "st1 { v5.h }[2], [x15], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x16], #0x1\n"
- "st1 { v24.b }[6], [x15], #0x1\n"
- "st1 { v23.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x8], #0x1\n"
+ "st1 { v27.b }[6], [x17], #0x1\n"
+ "st1 { v1.b }[6], [x16], #0x1\n"
+ "st1 { v5.b }[6], [x15], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x16], #0x1\n"
- "st1 { v24.b }[4], [x15], #0x1\n"
- "st1 { v23.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x8], #0x1\n"
+ "st1 { v27.b }[4], [x17], #0x1\n"
+ "st1 { v1.b }[4], [x16], #0x1\n"
+ "st1 { v5.b }[4], [x15], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v7.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x16], #0x2\n"
- "st1 { v24.h }[0], [x15], #0x2\n"
- "st1 { v23.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x8], #0x2\n"
+ "st1 { v27.h }[0], [x17], #0x2\n"
+ "st1 { v1.h }[0], [x16], #0x2\n"
+ "st1 { v5.h }[0], [x15], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x16], #0x1\n"
- "st1 { v24.b }[2], [x15], #0x1\n"
- "st1 { v23.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x8], #0x1\n"
+ "st1 { v27.b }[2], [x17], #0x1\n"
+ "st1 { v1.b }[2], [x16], #0x1\n"
+ "st1 { v5.b }[2], [x15], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x16], #0x1\n"
- "st1 { v24.b }[0], [x15], #0x1\n"
- "st1 { v23.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x8], #0x1\n"
+ "st1 { v27.b }[0], [x17], #0x1\n"
+ "st1 { v1.b }[0], [x16], #0x1\n"
+ "st1 { v5.b }[0], [x15], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
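Note on the store tail above (the sqrdmulh / srshl / sqadd / smax / smin / uzp1 run): it is the usual fixed-point requantization of the int32 accumulators back to uint8. The following scalar model is an illustrative sketch only, not part of the patch; the parameter names mirror the arm_gemm::Requantize32 offsets referenced by these kernels, the sign convention of the shift argument is an assumption of the sketch, and the sign-correction fixup the assembly performs with the and/sshr/sqadd triple before srshl is omitted for brevity.

#include <algorithm>
#include <cstdint>

// Scalar sketch of one lane of the vector requantization tail:
// fixed-point multiply, rounding right shift, add the output offset,
// clamp to the quantized range, then narrow to a byte.
inline uint8_t requantize_lane(int32_t acc, int32_t per_layer_mul,
                               int32_t right_shift,   // bits to shift right (assumed non-negative here)
                               int32_t c_offset, int32_t minval, int32_t maxval)
{
    // SQRDMULH: rounding doubling multiply, keep the high 32 bits
    // (saturation of the INT32_MIN * INT32_MIN corner case is ignored).
    int64_t prod = static_cast<int64_t>(acc) * per_layer_mul;
    int32_t high = static_cast<int32_t>((prod + (1LL << 30)) >> 31);

    // SRSHL with a negative shift operand: rounding arithmetic shift right.
    int32_t rounded = right_shift > 0 ? (high + (1 << (right_shift - 1))) >> right_shift : high;

    // SQADD of the c_offset, SMAX/SMIN clamp, then UZP1 narrows to uint8.
    int32_t out = rounded + c_offset;
    out = std::max(minval, std::min(maxval, out));
    return static_cast<uint8_t>(out);
}

The generic output9 kernel below applies the per_layer_left_shift with sshl before the multiply; the clamp and narrowing steps are otherwise the same.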
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index f7aa889b56..0641563b63 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,21 +45,21 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v7.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v7.4s }, [x21]\n"
"ld1r { v6.16b }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v5.16b }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v5.16b }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v3.4s }, [x21]\n"
"ld1r { v2.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
+ "ld1r { v1.4s }, [x20]\n"
"cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
@@ -68,75 +68,75 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x25, %x[inptrs]\n"
- "ldp x21, x20, [x25], #0x10\n"
- "subs x24, %x[n_points], #0x1\n"
- "ldr s14, [x21, x11]\n"
- "ldr s15, [x20, x11]\n"
+ "mov x23, %x[inptrs]\n"
+ "subs x22, %x[n_points], #0x1\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x20, x11]\n"
- "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
+ "ldp x21, x20, [x23], #0x10\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x21, x11]\n"
- "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s20, [x21, x11]\n"
"usubl v0.8h, v0.8b, v5.8b\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x20, x11]\n"
- "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x20, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
+ "ldr s20, [x21, x11]\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x23], #0x8\n"
"usubl v20.8h, v20.8b, v6.8b\n"
"usubl v21.8h, v21.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x23, x22, [x25], #0x10\n"
- "ldp x21, x20, [x25], #0x10\n"
+ "ldp x21, x20, [x23], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x23, x11]\n"
- "ldr s15, [x22, x11]\n"
+ "subs x22, x22, #0x1\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x21, x11]\n"
- "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s18, [x21, x11]\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x20, x11]\n"
- "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x21, x11]\n"
- "usubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x20, x11]\n"
- "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
+ "ldr s20, [x21, x11]\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x23], #0x8\n"
"usubl v20.8h, v20.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
"usubl v21.8h, v21.8b, v6.8b\n"
"usubl v22.8h, v22.8b, v6.8b\n"
"bgt 3b\n"
@@ -162,27 +162,27 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
"sshl v25.4s, v25.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "and v18.16b, v23.16b, v1.16b\n"
- "and v17.16b, v24.16b, v1.16b\n"
- "and v16.16b, v25.16b, v1.16b\n"
- "sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
@@ -254,17 +254,17 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s23, [x28, x11]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s24, [x27, x11]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s25, [x26, x11]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s26, [x25, x11]\n"
+ "str s23, [x28, x11]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x11]\n"
+ "str s25, [x26, x11]\n"
+ "str s26, [x25, x11]\n"
"str s27, [x24, x11]\n"
"str s28, [x23, x11]\n"
"str s29, [x22, x11]\n"
@@ -290,24 +290,24 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x27, x26, [x10], #0x10\n"
- "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x23, x22, [x10], #0x10\n"
- "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v30.16b, v23.16b\n"
- "add x9, x9, x11\n"
- "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"usubl v0.8h, v0.8b, v5.8b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "ldp x25, x24, [x10], #0x10\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
@@ -358,27 +358,27 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ble 15f\n"
"12:" // Oddments: Planar loop
"ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x21, [x10], #0x8\n"
- "add x9, x9, x11\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
+ "add x9, x9, x11\n"
"add x28, x28, x11\n"
- "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
- "usubl v0.8h, v0.8b, v5.8b\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
@@ -465,36 +465,36 @@ void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
"sshl v25.4s, v25.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
"add x28, x28, x11\n"
- "and v18.16b, v23.16b, v1.16b\n"
- "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
"add x26, x26, x11\n"
- "and v16.16b, v25.16b, v1.16b\n"
- "sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
"add x24, x24, x11\n"
- "sshl v27.4s, v27.4s, v3.4s\n"
- "sshl v28.4s, v28.4s, v3.4s\n"
"add x23, x23, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x22, x22, x11\n"
- "sshl v29.4s, v29.4s, v3.4s\n"
- "sshl v30.4s, v30.4s, v3.4s\n"
"add x21, x21, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
"add x20, x20, x11\n"
- "sshl v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v2.4s\n"
- "sqrdmulh v27.4s, v27.4s, v2.4s\n"
- "sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index d69f391514..24831a7153 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,162 +41,162 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q11, [%x[params], #0x0]\n"
+ "ldr q14, [%x[params], #0x0]\n"
"ldr q5, [%x[params], #0x10]\n"
- "movi v8.16b, #0x1\n"
- "ushr v8.4s, v8.4s, #0x8\n"
+ "movi v18.16b, #0x1\n"
+ "movi v24.4s, #0x0\n"
"ldr q6, [%x[params], #0x20]\n"
"ldr q7, [%x[params], #0x30]\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "ld1 { v1.16b }, [x20]\n"
- "mov v28.16b, v1.16b\n"
- "mov v23.16b, v1.16b\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "ld1 { v2.16b }, [x20]\n"
- "mov v30.16b, v1.16b\n"
- "mov v21.16b, v2.16b\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "ld1 { v4.16b }, [x20]\n"
- "mov v20.16b, v2.16b\n"
- "mov v29.16b, v2.16b\n"
- "ldr x20, [%x[inptrs], #0x0]\n"
- "ld1 { v0.16b }, [x20]\n"
- "mov v9.16b, v4.16b\n"
- "mov v22.16b, v4.16b\n"
- "ldr x20, [%x[inptrs], #0x18]\n"
- "ld1 { v3.16b }, [x20]\n"
- "mov v31.16b, v4.16b\n"
- "ext v28.16b, v28.16b, v28.16b, #0x2\n"
- "ext v23.16b, v23.16b, v23.16b, #0x4\n"
- "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "movi v28.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "ldr x25, [%x[inptrs], #0x8]\n"
+ "ldr x24, [%x[inptrs], #0x10]\n"
+ "ushr v18.4s, v18.4s, #0x8\n"
+ "movi v27.4s, #0x0\n"
+ "ldr x23, [%x[inptrs], #0x20]\n"
+ "ldr x22, [%x[inptrs], #0x0]\n"
+ "movi v21.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x21, [%x[inptrs], #0x18]\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
"add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.4s }, [x20]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x2\n"
- "ext v20.16b, v20.16b, v20.16b, #0x4\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.4s }, [x20]\n"
- "ext v29.16b, v29.16b, v29.16b, #0x6\n"
- "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "ld1 { v1.16b }, [x25]\n"
+ "ld1 { v2.16b }, [x24]\n"
+ "movi v23.4s, #0x0\n"
+ "movi v8.4s, #0x0\n"
+ "ld1 { v4.16b }, [x23]\n"
+ "ld1 { v0.16b }, [x22]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "ld1 { v3.16b }, [x21]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "mov v31.16b, v1.16b\n"
+ "mov v9.16b, v1.16b\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_c_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x4\n"
- "ext v31.16b, v31.16b, v31.16b, #0x6\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v15.4s }, [x20]\n"
- "mov v27.16b, v0.16b\n"
- "mov v19.16b, v0.16b\n"
+ "ld1r { v11.4s }, [x21]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "mov v16.16b, v1.16b\n"
+ "mov v30.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x2\n"
+ "add x10, %x[qp], %[offsetof_Requantize32_maxval]\n"
"cmp %x[n_channels], #0x4\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
"mov x9, #0x0\n"
- "mov v18.16b, v0.16b\n"
- "mov v26.16b, v3.16b\n"
"mov x28, #0x0\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x2\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x4\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "mov v17.16b, v3.16b\n"
- "mov v16.16b, v3.16b\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "neg v19.4s, v19.4s\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- "ext v27.16b, v27.16b, v27.16b, #0x2\n"
- "ext v19.16b, v19.16b, v19.16b, #0x4\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
"add %x[params], %x[params], #0x40\n"
- "ext v18.16b, v18.16b, v18.16b, #0x6\n"
- "zip1 v1.4s, v1.4s, v23.4s\n"
- "zip1 v28.4s, v28.4s, v30.4s\n"
- "zip1 v2.4s, v2.4s, v20.4s\n"
- "zip1 v21.4s, v21.4s, v29.4s\n"
- "ext v26.16b, v26.16b, v26.16b, #0x2\n"
- "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "zip1 v1.4s, v1.4s, v9.4s\n"
+ "ld1r { v9.4s }, [x10]\n"
+ "zip1 v31.4s, v31.4s, v16.4s\n"
+ "mov v16.16b, v2.16b\n"
+ "zip1 v2.4s, v2.4s, v29.4s\n"
+ "mov v29.16b, v4.16b\n"
"ext v16.16b, v16.16b, v16.16b, #0x6\n"
- "zip1 v4.4s, v4.4s, v22.4s\n"
- "zip1 v9.4s, v9.4s, v31.4s\n"
- "zip1 v0.4s, v0.4s, v19.4s\n"
- "zip1 v27.4s, v27.4s, v18.4s\n"
- "zip1 v1.4s, v1.4s, v28.4s\n"
- "zip1 v2.4s, v2.4s, v21.4s\n"
- ".inst 0x6f81e118 // udot v24.4s, v8.16b, v1.4b[0]\n"
- "zip1 v3.4s, v3.4s, v17.4s\n"
- "zip1 v26.4s, v26.4s, v16.4s\n"
- ".inst 0x6fa1e119 // udot v25.4s, v8.16b, v1.4b[1]\n"
- "zip1 v4.4s, v4.4s, v9.4s\n"
- "movi v23.4s, #0x0\n"
- ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
- "movi v22.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- ".inst 0x6fa1e916 // udot v22.4s, v8.16b, v1.4b[3]\n"
- "movi v19.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- ".inst 0x6f82e115 // udot v21.4s, v8.16b, v2.4b[0]\n"
- "movi v10.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- ".inst 0x6fa2e113 // udot v19.4s, v8.16b, v2.4b[1]\n"
- "movi v18.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6f82e909 // udot v9.4s, v8.16b, v2.4b[2]\n"
- "movi v16.4s, #0x0\n"
- "zip1 v0.4s, v0.4s, v27.4s\n"
- ".inst 0x6fa2e90a // udot v10.4s, v8.16b, v2.4b[3]\n"
- "zip1 v3.4s, v3.4s, v26.4s\n"
- ".inst 0x6f84e114 // udot v20.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6fa4e112 // udot v18.4s, v8.16b, v4.4b[1]\n"
- ".inst 0x6f84e911 // udot v17.4s, v8.16b, v4.4b[2]\n"
- ".inst 0x6fa4e910 // udot v16.4s, v8.16b, v4.4b[3]\n"
- "movi v31.4s, #0x0\n"
+ "zip1 v1.4s, v1.4s, v31.4s\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x2\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x4\n"
+ "zip1 v30.4s, v30.4s, v16.4s\n"
+ "mov v16.16b, v4.16b\n"
+ ".inst 0x6f81e258 // udot v24.4s, v18.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e25c // udot v28.4s, v18.16b, v1.4b[1]\n"
+ ".inst 0x6f81ea5a // udot v26.4s, v18.16b, v1.4b[2]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v31.4s\n"
+ "mov v31.16b, v0.16b\n"
+ ".inst 0x6fa1ea5b // udot v27.4s, v18.16b, v1.4b[3]\n"
+ "zip1 v2.4s, v2.4s, v30.4s\n"
+ "mov v30.16b, v0.16b\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x2\n"
+ "zip1 v29.4s, v29.4s, v16.4s\n"
+ "mov v16.16b, v0.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x4\n"
+ ".inst 0x6f82e255 // udot v21.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e24c // udot v12.4s, v18.16b, v2.4b[1]\n"
+ ".inst 0x6f82ea4d // udot v13.4s, v18.16b, v2.4b[2]\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v29.4s\n"
+ "mov v29.16b, v3.16b\n"
+ ".inst 0x6fa2ea4f // udot v15.4s, v18.16b, v2.4b[3]\n"
+ "zip1 v0.4s, v0.4s, v30.4s\n"
+ "mov v30.16b, v3.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x2\n"
+ "zip1 v31.4s, v31.4s, v16.4s\n"
+ "mov v16.16b, v3.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x4\n"
+ ".inst 0x6f84e257 // udot v23.4s, v18.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e248 // udot v8.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6f84ea54 // udot v20.4s, v18.16b, v4.4b[2]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v0.4s, v0.4s, v31.4s\n"
+ ".inst 0x6fa4ea51 // udot v17.4s, v18.16b, v4.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v30.4s\n"
"movi v30.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- ".inst 0x6f80e11f // udot v31.4s, v8.16b, v0.4b[0]\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- ".inst 0x6fa0e11e // udot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v31.4s, #0x0\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "zip1 v29.4s, v29.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ ".inst 0x6f80e256 // udot v22.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e259 // udot v25.4s, v18.16b, v0.4b[1]\n"
+ ".inst 0x6f80ea5e // udot v30.4s, v18.16b, v0.4b[2]\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ ".inst 0x6fa0ea5f // udot v31.4s, v18.16b, v0.4b[3]\n"
+ "add v27.4s, v27.4s, v15.4s\n"
+ "zip1 v3.4s, v3.4s, v29.4s\n"
"movi v29.4s, #0x0\n"
- ".inst 0x6f80e91a // udot v26.4s, v8.16b, v0.4b[2]\n"
- ".inst 0x6fa0e91b // udot v27.4s, v8.16b, v0.4b[3]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6fa3e11d // udot v29.4s, v8.16b, v3.4b[1]\n"
- "add v24.4s, v24.4s, v21.4s\n"
- "add v25.4s, v25.4s, v19.4s\n"
- "add v23.4s, v23.4s, v9.4s\n"
- "add v22.4s, v22.4s, v10.4s\n"
- "add v21.4s, v20.4s, v21.4s\n"
- "movi v20.4s, #0x0\n"
- ".inst 0x6f83e914 // udot v20.4s, v8.16b, v3.4b[2]\n"
- "add v19.4s, v18.4s, v19.4s\n"
- "movi v18.4s, #0x0\n"
- ".inst 0x6fa3e912 // udot v18.4s, v8.16b, v3.4b[3]\n"
- "add v17.4s, v17.4s, v9.4s\n"
- "add v16.4s, v16.4s, v10.4s\n"
- "add v24.4s, v24.4s, v31.4s\n"
- "add v25.4s, v25.4s, v30.4s\n"
- "add v26.4s, v23.4s, v26.4s\n"
- "add v27.4s, v22.4s, v27.4s\n"
- "add v28.4s, v21.4s, v28.4s\n"
- "add v29.4s, v19.4s, v29.4s\n"
- "add v30.4s, v17.4s, v20.4s\n"
- "add v31.4s, v16.4s, v18.4s\n"
- "neg v12.4s, v12.4s\n"
- "mul v24.4s, v24.4s, v12.4s\n"
- "mul v25.4s, v25.4s, v12.4s\n"
- "mul v26.4s, v26.4s, v12.4s\n"
- "mul v27.4s, v27.4s, v12.4s\n"
- "mul v28.4s, v28.4s, v12.4s\n"
- "mul v29.4s, v29.4s, v12.4s\n"
- "mul v30.4s, v30.4s, v12.4s\n"
- "mul v31.4s, v31.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v21.4s\n"
+ "movi v21.4s, #0x0\n"
+ "add v12.4s, v8.4s, v12.4s\n"
+ "movi v8.4s, #0x0\n"
+ ".inst 0x6f83e250 // udot v16.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e25d // udot v29.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6f83ea55 // udot v21.4s, v18.16b, v3.4b[2]\n"
+ "add v20.4s, v20.4s, v13.4s\n"
+ ".inst 0x6fa3ea48 // udot v8.4s, v18.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v15.4s\n"
+ "add v24.4s, v24.4s, v22.4s\n"
+ "add v25.4s, v28.4s, v25.4s\n"
+ "add v26.4s, v26.4s, v30.4s\n"
+ "add v27.4s, v27.4s, v31.4s\n"
+ "add v28.4s, v23.4s, v16.4s\n"
+ "add v29.4s, v12.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v21.4s\n"
+ "add v31.4s, v17.4s, v8.4s\n"
+ "mul v24.4s, v24.4s, v19.4s\n"
+ "mul v25.4s, v25.4s, v19.4s\n"
+ "mul v26.4s, v26.4s, v19.4s\n"
+ "mul v27.4s, v27.4s, v19.4s\n"
+ "mul v28.4s, v28.4s, v19.4s\n"
+ "mul v29.4s, v29.4s, v19.4s\n"
+ "mul v30.4s, v30.4s, v19.4s\n"
+ "mul v31.4s, v31.4s, v19.4s\n"
"zip1 v19.4s, v24.4s, v26.4s\n"
"zip1 v18.4s, v25.4s, v27.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
"zip1 v16.4s, v29.4s, v31.4s\n"
"zip1 v22.4s, v19.4s, v18.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
"zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v11.4s\n"
- "add v25.4s, v25.4s, v11.4s\n"
- "add v26.4s, v26.4s, v11.4s\n"
- "add v27.4s, v27.4s, v11.4s\n"
- "add v28.4s, v28.4s, v11.4s\n"
- "add v29.4s, v29.4s, v11.4s\n"
- "add v30.4s, v30.4s, v11.4s\n"
- "add v31.4s, v31.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
"ble 2f\n"
"1:" // Loop
"ldr q8, [%x[params], #0x0]\n"
@@ -207,96 +207,96 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
- "cmp %x[n_channels], #0x4\n"
- "add x9, x9, #0x10\n"
- ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
"ldr q5, [%x[params], #0x30]\n"
- ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v8.4s\n"
".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
"ldr q6, [%x[params], #0x40]\n"
- "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v21.16b\n"
".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
"ldr q7, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "and v19.16b, v24.16b, v21.16b\n"
"and v18.16b, v25.16b, v21.16b\n"
"and v17.16b, v26.16b, v21.16b\n"
"and v16.16b, v27.16b, v21.16b\n"
- "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v8.4s\n"
- "sqrdmulh v29.4s, v29.4s, v8.4s\n"
- "sqrdmulh v30.4s, v30.4s, v8.4s\n"
- "sqrdmulh v31.4s, v31.4s, v8.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
+ "and v18.16b, v29.16b, v21.16b\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
"and v17.16b, v30.16b, v21.16b\n"
"and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v21.4s\n"
- "srshl v25.4s, v25.4s, v21.4s\n"
"srshl v26.4s, v26.4s, v21.4s\n"
"srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v21.4s\n"
"srshl v29.4s, v29.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
"srshl v30.4s, v30.4s, v21.4s\n"
"srshl v31.4s, v31.4s, v21.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v9.4s\n"
+ "smin v25.4s, v25.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v9.4s\n"
+ "smin v27.4s, v27.4s, v9.4s\n"
+ "smin v28.4s, v28.4s, v9.4s\n"
+ "smin v29.4s, v29.4s, v9.4s\n"
+ "smin v30.4s, v30.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v10.4s\n"
+ "smax v25.4s, v25.4s, v10.4s\n"
+ "smax v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v10.4s\n"
+ "smax v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v10.4s\n"
+ "smax v30.4s, v30.4s, v10.4s\n"
+ "smax v31.4s, v31.4s, v10.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -307,33 +307,33 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s24, [x27, x28]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s25, [x26, x28]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s26, [x25, x28]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s27, [x24, x28]\n"
- "str s28, [x23, x28]\n"
+ "str s24, [x27, x28]\n"
+ "str s25, [x26, x28]\n"
"dup v24.4s, v22.s[0]\n"
"dup v25.4s, v22.s[1]\n"
- "str s29, [x22, x28]\n"
+ "str s26, [x25, x28]\n"
"dup v26.4s, v22.s[2]\n"
+ "str s27, [x24, x28]\n"
"dup v27.4s, v22.s[3]\n"
- "str s30, [x21, x28]\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "str s28, [x23, x28]\n"
"dup v28.4s, v23.s[0]\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "str s29, [x22, x28]\n"
"dup v29.4s, v23.s[1]\n"
- "str s31, [x20, x28]\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "str s30, [x21, x28]\n"
"dup v30.4s, v23.s[2]\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "str s31, [x20, x28]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v20.4s\n"
- "add v25.4s, v25.4s, v20.4s\n"
- "add v26.4s, v26.4s, v20.4s\n"
- "add v27.4s, v27.4s, v20.4s\n"
"add v28.4s, v28.4s, v20.4s\n"
"add v29.4s, v29.4s, v20.4s\n"
"add v30.4s, v30.4s, v20.4s\n"
@@ -348,98 +348,98 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
"cmp %x[n_channels], #0x4\n"
"add x27, x27, x28\n"
- ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
"add x26, x26, x28\n"
"add x25, x25, x28\n"
- ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
"add x24, x24, x28\n"
"add x23, x23, x28\n"
- ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
"add x22, x22, x28\n"
"add x21, x21, x28\n"
- ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
"add x20, x20, x28\n"
"add %x[params], %x[params], #0x20\n"
- ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
- "and v19.16b, v24.16b, v20.16b\n"
".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
"and v18.16b, v29.16b, v20.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
"and v17.16b, v30.16b, v20.16b\n"
"and v16.16b, v31.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v20.4s\n"
"srshl v29.4s, v29.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v14.4s\n"
- "add v25.4s, v25.4s, v14.4s\n"
- "add v26.4s, v26.4s, v14.4s\n"
- "add v27.4s, v27.4s, v14.4s\n"
- "add v28.4s, v28.4s, v14.4s\n"
- "add v29.4s, v29.4s, v14.4s\n"
- "add v30.4s, v30.4s, v14.4s\n"
- "add v31.4s, v31.4s, v14.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v13.4s\n"
- "smax v25.4s, v25.4s, v13.4s\n"
- "smax v26.4s, v26.4s, v13.4s\n"
- "smax v27.4s, v27.4s, v13.4s\n"
- "smax v28.4s, v28.4s, v13.4s\n"
- "smax v29.4s, v29.4s, v13.4s\n"
- "smax v30.4s, v30.4s, v13.4s\n"
- "smax v31.4s, v31.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v24.4s, v24.4s, v9.4s\n"
+ "smin v25.4s, v25.4s, v9.4s\n"
+ "smin v26.4s, v26.4s, v9.4s\n"
+ "smin v27.4s, v27.4s, v9.4s\n"
+ "smin v28.4s, v28.4s, v9.4s\n"
+ "smin v29.4s, v29.4s, v9.4s\n"
+ "smin v30.4s, v30.4s, v9.4s\n"
+ "smin v31.4s, v31.4s, v9.4s\n"
+ "smax v24.4s, v24.4s, v10.4s\n"
+ "smax v25.4s, v25.4s, v10.4s\n"
+ "smax v26.4s, v26.4s, v10.4s\n"
+ "smax v27.4s, v27.4s, v10.4s\n"
+ "smax v28.4s, v28.4s, v10.4s\n"
+ "smax v29.4s, v29.4s, v10.4s\n"
+ "smax v30.4s, v30.4s, v10.4s\n"
+ "smax v31.4s, v31.4s, v10.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -509,7 +509,7 @@ void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
"4:" // Tail: End
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
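The prologue of this dot-product kernel builds per-output sums of the input bytes with UDOT against a vector of ones, negates the Requantize32 b_offset, and multiplies the two before adding the packed parameters. The reason is the weight zero-point correction: subtracting b_offset from every weight is algebraically the same as subtracting b_offset times the sum of the inputs each output reads. A minimal scalar sketch of the idea follows, assuming a flat input array rather than the interleaved packing the kernel uses, and assuming the remaining zero-point terms are folded into the packed parameters (an assumption of the sketch, not something shown by this hunk).

#include <cstdint>

// Weight zero-point correction for quantized convolution: instead of
// subtracting b_offset from every weight, subtract b_offset * (sum of the
// input bytes the output reads) from the accumulator. The kernel computes
// that sum with UDOT against a vector of ones; here it is a plain loop.
inline int32_t offset_corrected_bias(int32_t bias, int32_t b_offset,
                                     const uint8_t *inputs, int n)
{
    int32_t input_sum = 0;
    for (int i = 0; i < n; i++)
    {
        input_sum += inputs[i];
    }
    return bias - b_offset * input_sum;  // mirrors the neg + mul + add seen in the asm
}

The 5x5 dot-product kernel that follows uses the same trick.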
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 61cec2b66d..4558812cbb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,158 +41,158 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "ldr q12, [%x[params], #0x0]\n"
+ "ldr q22, [%x[params], #0x0]\n"
"ldr q8, [%x[params], #0x10]\n"
- "movi v30.16b, #0x1\n"
- "movi v17.4s, #0x0\n"
+ "movi v23.16b, #0x1\n"
+ "movi v19.4s, #0x0\n"
"ldr q9, [%x[params], #0x20]\n"
"ldr q10, [%x[params], #0x30]\n"
- "movi v16.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
"ldr q11, [%x[params], #0x40]\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "movi v24.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "ld1 { v3.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x20]\n"
- "mov v26.16b, v3.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "ld1 { v4.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "mov v21.16b, v4.16b\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- "ld1 { v2.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x8]\n"
- "mov v27.16b, v2.16b\n"
- "ext v27.16b, v27.16b, v27.16b, #0x1\n"
- "ld1 { v1.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x28]\n"
- "zip1 v3.2d, v3.2d, v26.2d\n"
- "zip1 v4.2d, v4.2d, v21.2d\n"
- "ld1 { v5.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x30]\n"
- "mov v26.16b, v1.16b\n"
- "mov v22.16b, v5.16b\n"
- "ld1 { v6.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x38]\n"
- "mov v19.16b, v6.16b\n"
- "ext v26.16b, v26.16b, v26.16b, #0x1\n"
- "ld1 { v7.16b }, [x20]\n"
- "ldr x20, [%x[inptrs], #0x0]\n"
- "mov v21.16b, v7.16b\n"
- "zip1 v2.2d, v2.2d, v27.2d\n"
- "ld1 { v0.16b }, [x20]\n"
- "ext v22.16b, v22.16b, v22.16b, #0x1\n"
- "ext v19.16b, v19.16b, v19.16b, #0x1\n"
- ".inst 0x6f83e3d1 // udot v17.4s, v30.16b, v3.4b[0]\n"
- "ext v21.16b, v21.16b, v21.16b, #0x1\n"
- ".inst 0x6f83ebd0 // udot v16.4s, v30.16b, v3.4b[2]\n"
- ".inst 0x6f84e3d9 // udot v25.4s, v30.16b, v4.4b[0]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v23.4s }, [x20]\n"
- ".inst 0x6f84ebd8 // udot v24.4s, v30.16b, v4.4b[2]\n"
- "mov v18.16b, v0.16b\n"
- ".inst 0x6f82e3df // udot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v28.4s, #0x0\n"
+ "ldr x24, [%x[inptrs], #0x20]\n"
+ "ldr x23, [%x[inptrs], #0x10]\n"
"movi v29.4s, #0x0\n"
- "movi v28.4s, #0x1\n"
- ".inst 0x6f82ebdd // udot v29.4s, v30.16b, v2.4b[2]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.4s }, [x20]\n"
- "ext v18.16b, v18.16b, v18.16b, #0x1\n"
- "zip1 v1.2d, v1.2d, v26.2d\n"
- ".inst 0x6fa3e391 // udot v17.4s, v28.16b, v3.4b[1]\n"
- "zip1 v5.2d, v5.2d, v22.2d\n"
- "zip1 v6.2d, v6.2d, v19.2d\n"
- ".inst 0x6fa3eb90 // udot v16.4s, v28.16b, v3.4b[3]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
- "ld1r { v14.4s }, [x20]\n"
- "zip1 v7.2d, v7.2d, v21.2d\n"
- "movi v22.4s, #0x0\n"
- ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v25.4s, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x8]\n"
+ "ldr x21, [%x[inptrs], #0x28]\n"
"movi v21.4s, #0x0\n"
- ".inst 0x6fa4eb98 // udot v24.4s, v28.16b, v4.4b[3]\n"
- ".inst 0x6f81e3d6 // udot v22.4s, v30.16b, v1.4b[0]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v15.4s }, [x20]\n"
+ "movi v16.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "ld1 { v4.16b }, [x24]\n"
+ "ld1 { v2.16b }, [x23]\n"
+ "movi v30.4s, #0x0\n"
"movi v20.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- ".inst 0x6f85e3da // udot v26.4s, v30.16b, v5.4b[0]\n"
- "cmp %x[n_channels], #0x4\n"
- "zip1 v0.2d, v0.2d, v18.2d\n"
+ "ld1 { v1.16b }, [x22]\n"
+ "ld1 { v5.16b }, [x21]\n"
+ "movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
- ".inst 0x6f85ebdb // udot v27.4s, v30.16b, v5.4b[2]\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "mov v7.16b, v3.16b\n"
+ "ldr x22, [%x[inptrs], #0x38]\n"
+ "movi v24.4s, #0x0\n"
+ "mov v0.16b, v4.16b\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov v14.16b, v2.16b\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "add x11, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x10, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "cmp %x[n_channels], #0x4\n"
"mov x9, #0x0\n"
- ".inst 0x6f86e3d4 // udot v20.4s, v30.16b, v6.4b[0]\n"
- ".inst 0x6f86ebd3 // udot v19.4s, v30.16b, v6.4b[2]\n"
- "add v17.4s, v17.4s, v25.4s\n"
"mov x28, #0x0\n"
- "movi v25.4s, #0x0\n"
- ".inst 0x6f87e3d2 // udot v18.4s, v30.16b, v7.4b[0]\n"
- ".inst 0x6f87ebd9 // udot v25.4s, v30.16b, v7.4b[2]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- ".inst 0x6fa2e39f // udot v31.4s, v28.16b, v2.4b[1]\n"
- ".inst 0x6fa2eb9d // udot v29.4s, v28.16b, v2.4b[3]\n"
- "add v16.4s, v16.4s, v24.4s\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "movi v24.4s, #0x0\n"
- ".inst 0x6f80e3d8 // udot v24.4s, v30.16b, v0.4b[0]\n"
- ".inst 0x6fa1e396 // udot v22.4s, v28.16b, v1.4b[1]\n"
+ "zip1 v3.2d, v3.2d, v7.2d\n"
+ "ld1 { v7.16b }, [x22]\n"
+ "neg v12.4s, v12.4s\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- ".inst 0x6fa1eb95 // udot v21.4s, v28.16b, v1.4b[3]\n"
- ".inst 0x6fa5e39a // udot v26.4s, v28.16b, v5.4b[1]\n"
- "add v31.4s, v31.4s, v17.4s\n"
+ "zip1 v4.2d, v4.2d, v0.2d\n"
+ "ld1 { v0.16b }, [x21]\n"
+ "zip1 v2.2d, v2.2d, v14.2d\n"
+ "ld1r { v14.4s }, [x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- ".inst 0x6fa5eb9b // udot v27.4s, v28.16b, v5.4b[3]\n"
- ".inst 0x6fa6e394 // udot v20.4s, v28.16b, v6.4b[1]\n"
- "add v29.4s, v29.4s, v16.4s\n"
"add %x[params], %x[params], #0x50\n"
- ".inst 0x6fa6eb93 // udot v19.4s, v28.16b, v6.4b[3]\n"
- ".inst 0x6fa7e392 // udot v18.4s, v28.16b, v7.4b[1]\n"
- "add v22.4s, v22.4s, v31.4s\n"
- ".inst 0x6fa7eb99 // udot v25.4s, v28.16b, v7.4b[3]\n"
- ".inst 0x6fa0e398 // udot v24.4s, v28.16b, v0.4b[1]\n"
- "add v21.4s, v21.4s, v29.4s\n"
- "add v20.4s, v26.4s, v20.4s\n"
- "add v19.4s, v27.4s, v19.4s\n"
- "add v18.4s, v18.4s, v17.4s\n"
- "movi v17.4s, #0x0\n"
- ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
- ".inst 0x6fa0eb91 // udot v17.4s, v28.16b, v0.4b[3]\n"
- "add v16.4s, v25.4s, v16.4s\n"
- "add v24.4s, v22.4s, v24.4s\n"
- "add v25.4s, v21.4s, v17.4s\n"
- "add v26.4s, v26.4s, v22.4s\n"
- "add v27.4s, v27.4s, v21.4s\n"
- "add v28.4s, v20.4s, v31.4s\n"
- "add v29.4s, v19.4s, v29.4s\n"
- "add v30.4s, v20.4s, v18.4s\n"
- "add v31.4s, v19.4s, v16.4s\n"
- "neg v23.4s, v23.4s\n"
- "mul v24.4s, v24.4s, v23.4s\n"
- "mul v25.4s, v25.4s, v23.4s\n"
- "mul v26.4s, v26.4s, v23.4s\n"
- "mul v27.4s, v27.4s, v23.4s\n"
- "mul v28.4s, v28.4s, v23.4s\n"
- "mul v29.4s, v29.4s, v23.4s\n"
- "mul v30.4s, v30.4s, v23.4s\n"
- "mul v31.4s, v31.4s, v23.4s\n"
- "zip1 v19.4s, v24.4s, v26.4s\n"
- "zip1 v18.4s, v25.4s, v27.4s\n"
+ ".inst 0x6f83e2f3 // udot v19.4s, v23.16b, v3.4b[0]\n"
+ ".inst 0x6f83eaed // udot v13.4s, v23.16b, v3.4b[2]\n"
+ ".inst 0x6f84e2ef // udot v15.4s, v23.16b, v4.4b[0]\n"
+ ".inst 0x6f84eaff // udot v31.4s, v23.16b, v4.4b[2]\n"
+ ".inst 0x6f82e2fc // udot v28.4s, v23.16b, v2.4b[0]\n"
+ ".inst 0x6f82eafd // udot v29.4s, v23.16b, v2.4b[2]\n"
+ ".inst 0x6fa3e333 // udot v19.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa3eb2d // udot v13.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e32f // udot v15.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa4eb3f // udot v31.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa2e33c // udot v28.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa2eb3d // udot v29.4s, v25.16b, v2.4b[3]\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "ld1r { v15.4s }, [x11]\n"
+ "add v31.4s, v13.4s, v31.4s\n"
+ "mov v13.16b, v1.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ "add v28.4s, v28.4s, v19.4s\n"
+ "add v29.4s, v29.4s, v31.4s\n"
+ "zip1 v1.2d, v1.2d, v13.2d\n"
+ "mov v13.16b, v5.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x6f81e2f5 // udot v21.4s, v23.16b, v1.4b[0]\n"
+ ".inst 0x6f81eaf0 // udot v16.4s, v23.16b, v1.4b[2]\n"
+ "zip1 v5.2d, v5.2d, v13.2d\n"
+ "mov v13.16b, v6.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x6f85e2fa // udot v26.4s, v23.16b, v5.4b[0]\n"
+ ".inst 0x6f85eafb // udot v27.4s, v23.16b, v5.4b[2]\n"
+ ".inst 0x6fa1e335 // udot v21.4s, v25.16b, v1.4b[1]\n"
+ "zip1 v6.2d, v6.2d, v13.2d\n"
+ "mov v13.16b, v7.16b\n"
+ ".inst 0x6fa1eb30 // udot v16.4s, v25.16b, v1.4b[3]\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x6f86e2fe // udot v30.4s, v23.16b, v6.4b[0]\n"
+ ".inst 0x6f86eaf4 // udot v20.4s, v23.16b, v6.4b[2]\n"
+ ".inst 0x6fa5e33a // udot v26.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ "add v21.4s, v21.4s, v28.4s\n"
+ "zip1 v7.2d, v7.2d, v13.2d\n"
+ "ld1r { v13.4s }, [x10]\n"
+ "add v16.4s, v16.4s, v29.4s\n"
+ ".inst 0x6fa6e33e // udot v30.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6fa6eb34 // udot v20.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6f87e2f1 // udot v17.4s, v23.16b, v7.4b[0]\n"
+ ".inst 0x6f87eaf2 // udot v18.4s, v23.16b, v7.4b[2]\n"
+ "add v30.4s, v26.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6fa7e331 // udot v17.4s, v25.16b, v7.4b[1]\n"
+ "add v20.4s, v27.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v16.4s\n"
+ "add v28.4s, v30.4s, v28.4s\n"
+ ".inst 0x6fa7eb32 // udot v18.4s, v25.16b, v7.4b[3]\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "add v19.4s, v17.4s, v19.4s\n"
+ "mov v17.16b, v0.16b\n"
+ "add v29.4s, v20.4s, v29.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x1\n"
+ "add v31.4s, v18.4s, v31.4s\n"
+ "movi v18.4s, #0x0\n"
+ "add v30.4s, v30.4s, v19.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "zip1 v0.2d, v0.2d, v17.2d\n"
+ "add v31.4s, v20.4s, v31.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ ".inst 0x6f80e2f8 // udot v24.4s, v23.16b, v0.4b[0]\n"
+ ".inst 0x6f80eaf2 // udot v18.4s, v23.16b, v0.4b[2]\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
"zip1 v17.4s, v28.4s, v30.4s\n"
- "zip1 v16.4s, v29.4s, v31.4s\n"
- "zip1 v22.4s, v19.4s, v18.4s\n"
- "zip1 v23.4s, v17.4s, v16.4s\n"
- "add v24.4s, v24.4s, v12.4s\n"
- "add v25.4s, v25.4s, v12.4s\n"
- "add v26.4s, v26.4s, v12.4s\n"
- "add v27.4s, v27.4s, v12.4s\n"
- "add v28.4s, v28.4s, v12.4s\n"
- "add v29.4s, v29.4s, v12.4s\n"
- "add v30.4s, v30.4s, v12.4s\n"
- "add v31.4s, v31.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v22.4s\n"
+ "add v30.4s, v30.4s, v22.4s\n"
+ ".inst 0x6fa0e338 // udot v24.4s, v25.16b, v0.4b[1]\n"
+ "zip1 v19.4s, v29.4s, v31.4s\n"
+ "add v29.4s, v29.4s, v22.4s\n"
+ ".inst 0x6fa0eb32 // udot v18.4s, v25.16b, v0.4b[3]\n"
+ "add v31.4s, v31.4s, v22.4s\n"
+ "add v24.4s, v21.4s, v24.4s\n"
+ "zip1 v23.4s, v17.4s, v19.4s\n"
+ "add v25.4s, v16.4s, v18.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "zip1 v17.4s, v24.4s, v26.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "zip1 v16.4s, v25.4s, v27.4s\n"
+ "add v27.4s, v27.4s, v22.4s\n"
+ "add v24.4s, v24.4s, v22.4s\n"
+ "add v25.4s, v25.4s, v22.4s\n"
+ "zip1 v22.4s, v17.4s, v16.4s\n"
"ble 2f\n"
"1:" // Loop
"ldr q12, [%x[params], #0x60]\n"
@@ -203,159 +203,159 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
- ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
- "cmp %x[n_channels], #0x4\n"
- "add x9, x9, #0x10\n"
- ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
"ldr q17, [%x[params], #0x0]\n"
- ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q16, [%x[params], #0x10]\n"
- ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
- ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
- ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q19, [%x[params], #0x20]\n"
- ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
- ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
- ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
- ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q18, [%x[params], #0x30]\n"
- ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
- ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
- ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
- ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
"ldr q17, [%x[params], #0x40]\n"
- ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
- ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
- ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
- ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
"ldr q16, [%x[params], #0x50]\n"
- ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
- ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
- ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
- ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
"ldr q10, [%x[params], #0xb0]\n"
- ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
- ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
- ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
- ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
"ldr q11, [%x[params], #0xc0]\n"
- ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
- ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v12.4s\n"
- ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
- ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v12.4s\n"
".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
"ldr q8, [%x[params], #0x90]\n"
- "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v21.16b\n"
".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
"ldr q9, [%x[params], #0xa0]\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "and v19.16b, v24.16b, v21.16b\n"
"and v18.16b, v25.16b, v21.16b\n"
"and v17.16b, v26.16b, v21.16b\n"
"and v16.16b, v27.16b, v21.16b\n"
- "add %x[params], %x[params], #0xd0\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v12.4s\n"
- "sqrdmulh v29.4s, v29.4s, v12.4s\n"
- "sqrdmulh v30.4s, v30.4s, v12.4s\n"
- "sqrdmulh v31.4s, v31.4s, v12.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
+ "and v18.16b, v29.16b, v21.16b\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
"sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v21.16b\n"
- "and v18.16b, v29.16b, v21.16b\n"
"and v17.16b, v30.16b, v21.16b\n"
"and v16.16b, v31.16b, v21.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v21.4s\n"
- "srshl v25.4s, v25.4s, v21.4s\n"
"srshl v26.4s, v26.4s, v21.4s\n"
"srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v21.4s\n"
"srshl v29.4s, v29.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v21.4s\n"
"srshl v31.4s, v31.4s, v21.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v28.4s, v28.4s, v13.4s\n"
- "add v29.4s, v29.4s, v13.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "add v31.4s, v31.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -366,33 +366,33 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
- "str s24, [x27, x28]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
- "str s25, [x26, x28]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
- "str s26, [x25, x28]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
- "str s27, [x24, x28]\n"
- "str s28, [x23, x28]\n"
+ "str s24, [x27, x28]\n"
+ "str s25, [x26, x28]\n"
"dup v24.4s, v22.s[0]\n"
"dup v25.4s, v22.s[1]\n"
- "str s29, [x22, x28]\n"
+ "str s26, [x25, x28]\n"
"dup v26.4s, v22.s[2]\n"
+ "str s27, [x24, x28]\n"
"dup v27.4s, v22.s[3]\n"
- "str s30, [x21, x28]\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "str s28, [x23, x28]\n"
"dup v28.4s, v23.s[0]\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "str s29, [x22, x28]\n"
"dup v29.4s, v23.s[1]\n"
- "str s31, [x20, x28]\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "str s30, [x21, x28]\n"
"dup v30.4s, v23.s[2]\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "str s31, [x20, x28]\n"
"dup v31.4s, v23.s[3]\n"
"add x28, x28, #0x4\n"
- "add v24.4s, v24.4s, v20.4s\n"
- "add v25.4s, v25.4s, v20.4s\n"
- "add v26.4s, v26.4s, v20.4s\n"
- "add v27.4s, v27.4s, v20.4s\n"
"add v28.4s, v28.4s, v20.4s\n"
"add v29.4s, v29.4s, v20.4s\n"
"add v30.4s, v30.4s, v20.4s\n"
@@ -407,160 +407,160 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
"cmp %x[n_channels], #0x4\n"
"add x27, x27, x28\n"
- ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
- ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
"add x26, x26, x28\n"
"add x25, x25, x28\n"
- ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
- ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
"add x24, x24, x28\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
"add x23, x23, x28\n"
- ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
"add x22, x22, x28\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
"add x21, x21, x28\n"
- ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
- "ldr q17, [%x[params], #0x0]\n"
"add x20, x20, x28\n"
- ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
"ldr q16, [%x[params], #0x10]\n"
- ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
- ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
- ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
- ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
"ldr q19, [%x[params], #0x20]\n"
- ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
- ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
- ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
- ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
"ldr q18, [%x[params], #0x30]\n"
- ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
- ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
- ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
- ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
"ldr q17, [%x[params], #0x40]\n"
- ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
- ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
- ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
- ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
"ldr q16, [%x[params], #0x50]\n"
"add %x[params], %x[params], #0x80\n"
- ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
- ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
- ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
- ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
- ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
- ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
- ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
- ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
- ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
- ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
- ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
- "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
- "sqrdmulh v26.4s, v26.4s, v21.4s\n"
".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
- "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
- "and v19.16b, v24.16b, v20.16b\n"
".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
"and v18.16b, v25.16b, v20.16b\n"
"and v17.16b, v26.16b, v20.16b\n"
"and v16.16b, v27.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v28.4s, v28.4s, v21.4s\n"
- "sqrdmulh v29.4s, v29.4s, v21.4s\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "sqrdmulh v31.4s, v31.4s, v21.4s\n"
"sqadd v24.4s, v24.4s, v19.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
"sqadd v25.4s, v25.4s, v18.4s\n"
"sqadd v26.4s, v26.4s, v17.4s\n"
- "sqadd v27.4s, v27.4s, v16.4s\n"
- "and v19.16b, v28.16b, v20.16b\n"
"and v18.16b, v29.16b, v20.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
"and v17.16b, v30.16b, v20.16b\n"
"and v16.16b, v31.16b, v20.16b\n"
"sshr v19.4s, v19.4s, #0x1f\n"
"sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v19.4s\n"
"sqadd v29.4s, v29.4s, v18.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v24.4s, v24.4s, v20.4s\n"
- "srshl v25.4s, v25.4s, v20.4s\n"
"srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
"srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
"srshl v28.4s, v28.4s, v20.4s\n"
"srshl v29.4s, v29.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
"srshl v30.4s, v30.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
"srshl v31.4s, v31.4s, v20.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "add v25.4s, v25.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v27.4s, v27.4s, v13.4s\n"
- "add v28.4s, v28.4s, v13.4s\n"
- "add v29.4s, v29.4s, v13.4s\n"
- "add v30.4s, v30.4s, v13.4s\n"
- "add v31.4s, v31.4s, v13.4s\n"
- "smin v24.4s, v24.4s, v15.4s\n"
- "smin v25.4s, v25.4s, v15.4s\n"
- "smin v26.4s, v26.4s, v15.4s\n"
- "smin v27.4s, v27.4s, v15.4s\n"
- "smin v28.4s, v28.4s, v15.4s\n"
- "smin v29.4s, v29.4s, v15.4s\n"
- "smin v30.4s, v30.4s, v15.4s\n"
- "smin v31.4s, v31.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v14.4s\n"
- "smax v25.4s, v25.4s, v14.4s\n"
- "smax v26.4s, v26.4s, v14.4s\n"
- "smax v27.4s, v27.4s, v14.4s\n"
- "smax v28.4s, v28.4s, v14.4s\n"
- "smax v29.4s, v29.4s, v14.4s\n"
- "smax v30.4s, v30.4s, v14.4s\n"
- "smax v31.4s, v31.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v13.4s\n"
+ "smin v25.4s, v25.4s, v13.4s\n"
+ "smin v26.4s, v26.4s, v13.4s\n"
+ "smin v27.4s, v27.4s, v13.4s\n"
+ "smin v28.4s, v28.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v13.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
@@ -630,7 +630,7 @@ void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
"4:" // Tail: End
: [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
: [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 0770c126ec..82d7f407e2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,21 +49,21 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v15.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.4s }, [x21]\n"
"ld1r { v13.16b }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.16b }, [x21]\n"
"ld1r { v11.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v10.4s }, [x21]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
+ "ld1r { v8.4s }, [x20]\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
"movi v31.4s, #0x0\n"
@@ -96,20 +96,20 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"3:" // Output channel loop: Load quantization parameters: Done
"ldr s5, [%x[weights]], #0x4\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
+ "ldp x21, x20, [x22], #0x10\n"
"ldr d0, [x21, #0x0]\n"
"ldr d4, [x20, #0x0]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
"usubl v0.8h, v0.8b, v13.8b\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
"cbz x23, 7f\n"
"ldr s7, [%x[weights]], #0x4\n"
"ldp x21, x20, [x22], #0x10\n"
"subs x23, x23, #0x1\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
"ldr d3, [x21, #0x0]\n"
"ldr d6, [x20, #0x0]\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
"usubl v3.8h, v3.8b, v13.8b\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
@@ -125,13 +125,13 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x21, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d4, [x20, #0x0]\n"
@@ -139,22 +139,22 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"ldp x21, x20, [x22], #0x10\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
"ldr d3, [x21, #0x0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"ldr d6, [x20, #0x0]\n"
@@ -172,54 +172,54 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v19.4s, v5.4h, v0.h[3]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v7.4h, v3.h[0]\n"
- "smlal v17.4s, v7.4h, v3.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v7.4h, v3.h[2]\n"
- "smlal v19.4s, v7.4h, v3.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
- "sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
- "and v3.16b, v16.16b, v8.16b\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
- "and v2.16b, v17.16b, v8.16b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
- "and v1.16b, v18.16b, v8.16b\n"
- "and v0.16b, v19.16b, v8.16b\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
"sshl v20.4s, v20.4s, v10.4s\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
- "smlal v28.4s, v7.4h, v6.h[4]\n"
- "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -357,49 +357,49 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -421,70 +421,70 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x20, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
"ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr s5, [%x[weights]], #0x4\n"
"ldr d4, [x28, #0x0]\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"smlal v16.4s, v5.4h, v0.h[0]\n"
"smlal v17.4s, v5.4h, v0.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
"smlal v18.4s, v5.4h, v0.h[2]\n"
"smlal v19.4s, v5.4h, v0.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "smlal v20.4s, v7.4h, v3.h[4]\n"
- "smlal v21.4s, v7.4h, v3.h[5]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "smlal v22.4s, v7.4h, v3.h[6]\n"
- "smlal v23.4s, v7.4h, v3.h[7]\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
"smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
- "and v3.16b, v16.16b, v8.16b\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
- "and v2.16b, v17.16b, v8.16b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
- "and v1.16b, v18.16b, v8.16b\n"
- "and v0.16b, v19.16b, v8.16b\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
"sshl v20.4s, v20.4s, v10.4s\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
- "smlal v28.4s, v5.4h, v4.h[4]\n"
- "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -622,49 +622,49 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -673,45 +673,45 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"7:" // Output channel loop: Single kernel point
"smlal v16.4s, v5.4h, v0.h[0]\n"
"smlal v17.4s, v5.4h, v0.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
"ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"smlal v18.4s, v5.4h, v0.h[2]\n"
"smlal v19.4s, v5.4h, v0.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
- "smlal v20.4s, v5.4h, v0.h[4]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x18]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
"and v3.16b, v16.16b, v8.16b\n"
- "smlal v27.4s, v5.4h, v4.h[3]\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
"and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
"and v1.16b, v18.16b, v8.16b\n"
- "smlal v28.4s, v5.4h, v4.h[4]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"and v0.16b, v19.16b, v8.16b\n"
- "sshl v20.4s, v20.4s, v10.4s\n"
- "smlal v29.4s, v5.4h, v4.h[5]\n"
- "sshl v21.4s, v21.4s, v10.4s\n"
- "sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v30.4s, v5.4h, v4.h[6]\n"
- "sshl v23.4s, v23.4s, v10.4s\n"
- "sshl v24.4s, v24.4s, v10.4s\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
@@ -848,49 +848,49 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -965,20 +965,20 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"18:" // Output channel oddments: Load quantization parameters: Done
"ldr s5, [%x[weights]], #0x4\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
+ "ldp x21, x20, [x22], #0x10\n"
"ldr d0, [x21, #0x0]\n"
"ldr d4, [x20, #0x0]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
"usubl v0.8h, v0.8b, v13.8b\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
"cbz x23, 22f\n"
"ldr s7, [%x[weights]], #0x4\n"
"ldp x21, x20, [x22], #0x10\n"
"subs x23, x23, #0x1\n"
- "usubl v7.8h, v7.8b, v12.8b\n"
"ldr d3, [x21, #0x0]\n"
"ldr d6, [x20, #0x0]\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
"usubl v3.8h, v3.8b, v13.8b\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
@@ -994,13 +994,13 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x21, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d4, [x20, #0x0]\n"
@@ -1008,22 +1008,22 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"ldp x21, x20, [x22], #0x10\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "usubl v5.8h, v5.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
"ldr d3, [x21, #0x0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"ldr d6, [x20, #0x0]\n"
@@ -1077,27 +1077,27 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d2, [x21, #0x0]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d1, [x20, #0x0]\n"
"ldr s0, [%x[weights]], #0x4\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "usubl v0.8h, v0.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
@@ -1145,18 +1145,18 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"sshl v17.4s, v17.4s, v10.4s\n"
"sshl v18.4s, v18.4s, v10.4s\n"
"sshl v19.4s, v19.4s, v10.4s\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
"and v3.16b, v16.16b, v8.16b\n"
"and v2.16b, v17.16b, v8.16b\n"
"and v1.16b, v18.16b, v8.16b\n"
"and v0.16b, v19.16b, v8.16b\n"
- "sshl v20.4s, v20.4s, v10.4s\n"
- "sshl v21.4s, v21.4s, v10.4s\n"
- "sshl v22.4s, v22.4s, v10.4s\n"
- "sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -1320,47 +1320,47 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"tbz %x[n_output_channels], #1, 24f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.h }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.h }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.h }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.h }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.h }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.h }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.h }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.h }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
- "add x9, x9, #0x2\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.h }[0], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.h }[0], [x26]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
"st1 { v26.h }[0], [x25]\n"
"st1 { v27.h }[0], [x24]\n"
"st1 { v28.h }[0], [x23]\n"
@@ -1370,46 +1370,46 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"tbz %x[n_output_channels], #0, 25f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.b }[2], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.b }[2], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.b }[2], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.b }[2], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.b }[2], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.b }[2], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.b }[2], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.b }[2], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.b }[2], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.b }[2], [x26]\n"
+ "add x20, x20, x9\n"
"st1 { v26.b }[2], [x25]\n"
"st1 { v27.b }[2], [x24]\n"
"st1 { v28.b }[2], [x23]\n"
@@ -1420,46 +1420,46 @@ void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_imp
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.b }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.b }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.b }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.b }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.b }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.b }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.b }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.b }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.b }[0], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.b }[0], [x26]\n"
+ "add x20, x20, x9\n"
"st1 { v26.b }[0], [x25]\n"
"st1 { v27.b }[0], [x24]\n"
"st1 { v28.b }[0], [x23]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index d1872c90f8..62ad1fc0f5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,1070 +91,1070 @@ void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x16, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x15, x16, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v18.16b }, [x20]\n"
+ "mov x16, #0x0\n"
+ "mov x15, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x10, x17, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v29.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v5.8h }, [x21]\n"
- "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v12.8h }, [x21]\n"
+ "ld1r { v15.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "mov x14, #0x0\n"
- "ld1r { v12.8h }, [x20]\n"
- "mov x13, #0x0\n"
- "add x12, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x11, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x28, x27, [x22, #0x0]\n"
- "ldp x26, x25, [x22, #0x10]\n"
- "cbz x15, 3f\n"
- "ldr d19, [x11, #0x0]\n"
- "ldr d7, [x11, #0x8]\n"
- "subs x15, x15, #0x1\n"
- "usubl v19.8h, v19.8b, v18.8b\n"
- "ldr d1, [x11, #0x10]\n"
- "ldr d17, [x11, #0x18]\n"
- "usubl v7.8h, v7.8b, v18.8b\n"
- "usubl v1.8h, v1.8b, v18.8b\n"
- "ldr d8, [x11, #0x20]\n"
- "ldr d31, [x11, #0x28]\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "ldr d29, [x11, #0x30]\n"
- "ldr d16, [x11, #0x38]\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "ldr d4, [x11, #0x40]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "ldp x9, x28, [x22, #0x0]\n"
+ "ldp x27, x26, [x22, #0x10]\n"
+ "cbz x10, 3f\n"
+ "ldr d22, [x13, #0x0]\n"
+ "ldr d24, [x13, #0x8]\n"
+ "subs x10, x10, #0x1\n"
+ "ldr d9, [x13, #0x10]\n"
+ "ldr d7, [x13, #0x18]\n"
+ "ldr d25, [x13, #0x20]\n"
+ "ldr d4, [x13, #0x28]\n"
+ "ldr d13, [x13, #0x30]\n"
+ "ldr d14, [x13, #0x38]\n"
+ "usubl v22.8h, v22.8b, v29.8b\n"
+ "usubl v24.8h, v24.8b, v29.8b\n"
+ "ldr d2, [x13, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q9, [x20, #0x10]\n"
+ "usubl v9.8h, v9.8b, v29.8b\n"
+ "usubl v7.8h, v7.8b, v29.8b\n"
+ "usubl v25.8h, v25.8b, v29.8b\n"
+ "usubl v4.8h, v4.8b, v29.8b\n"
+ "usubl v13.8h, v13.8b, v29.8b\n"
+ "usubl v14.8h, v14.8b, v29.8b\n"
+ "ldr q20, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x23, x22, [x14, #0x0]\n"
"add x20, x20, #0x20\n"
+ "usubl v2.8h, v2.8b, v29.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "mov v3.16b, v28.16b\n"
- "mov v30.16b, v9.16b\n"
- "ldr d23, [x23, x14]\n"
- "ldr d10, [x22, x14]\n"
- "mov v0.16b, v28.16b\n"
- "mov v22.16b, v9.16b\n"
- "ldr d11, [x21, x14]\n"
- "ldr d13, [x20, x14]\n"
- "mov v6.16b, v28.16b\n"
- "mov v2.16b, v9.16b\n"
- "ldr x20, [x12, #0x20]\n"
- "ldr d27, [x20, x14]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ushll v10.8h, v10.8b, #0x0\n"
+ "mov v8.16b, v20.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "mov v3.16b, v20.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "mov v10.16b, v20.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d11, [x23, x16]\n"
+ "ldr d28, [x22, x16]\n"
+ "ldr d18, [x21, x16]\n"
+ "ldr d19, [x20, x16]\n"
+ "ldr x20, [x14, #0x20]\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "ushll v13.8h, v13.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr d23, [x20, x16]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q24, [x10, #0x0]\n"
- "ldr q25, [x9, #0x0]\n"
- "smlal v28.4s, v23.4h, v8.4h\n"
- "smlal2 v9.4s, v23.8h, v8.8h\n"
- "ldr q20, [x10, #0x10]\n"
- "ldr q26, [x9, #0x10]\n"
- "smlal v28.4s, v10.4h, v19.4h\n"
- "smlal v3.4s, v23.4h, v17.4h\n"
- "ldr x20, [x12, #0x28]\n"
- "ldr d21, [x20, x14]\n"
- "smlal v0.4s, v23.4h, v7.4h\n"
- "smlal v6.4s, v23.4h, v19.4h\n"
- "smlal2 v9.4s, v10.8h, v19.8h\n"
- "ldr x20, [x12, #0x38]\n"
- "ldr d10, [x20, x14]\n"
- "smlal v28.4s, v13.4h, v31.4h\n"
- "smlal2 v30.4s, v23.8h, v17.8h\n"
- "smlal2 v22.4s, v23.8h, v7.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr d15, [x20, x14]\n"
- "smlal2 v2.4s, v23.8h, v19.8h\n"
- "smlal v3.4s, v11.4h, v1.4h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "ldr x20, [x12, #0x40]\n"
- "ldr d23, [x20, x14]\n"
- "smlal v0.4s, v13.4h, v1.4h\n"
- "smlal v6.4s, v13.4h, v7.4h\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal2 v9.4s, v13.8h, v31.8h\n"
- "smlal v28.4s, v27.4h, v16.4h\n"
- "ldr x20, [x12, #0x48]\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal2 v30.4s, v11.8h, v1.8h\n"
- "ldr d11, [x20, x14]\n"
- "smlal2 v22.4s, v13.8h, v1.8h\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v2.4s, v13.8h, v7.8h\n"
- "smlal v3.4s, v13.4h, v8.4h\n"
- "ldr x21, [x12, #0x50]\n"
- "ldr x20, [x12, #0x58]\n"
- "smlal v0.4s, v21.4h, v29.4h\n"
- "smlal v6.4s, v27.4h, v17.4h\n"
+ "ldr q26, [x12, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ "smlal v20.4s, v11.4h, v25.4h\n"
+ "smlal2 v1.4s, v11.8h, v25.8h\n"
+ "ldr q0, [x12, #0x10]\n"
+ "ldr q5, [x11, #0x10]\n"
+ "smlal v8.4s, v11.4h, v7.4h\n"
+ "smlal v3.4s, v11.4h, v24.4h\n"
+ "ldr x25, [x14, #0x28]\n"
+ "smlal v10.4s, v11.4h, v22.4h\n"
+ "ldr x24, [x14, #0x38]\n"
+ "smlal2 v21.4s, v11.8h, v7.8h\n"
+ "smlal2 v30.4s, v11.8h, v24.8h\n"
+ "smlal2 v27.4s, v11.8h, v22.8h\n"
+ "ldr x23, [x14, #0x30]\n"
+ "ldr x22, [x14, #0x40]\n"
+ "smlal v20.4s, v28.4h, v22.4h\n"
+ "smlal2 v1.4s, v28.8h, v22.8h\n"
+ "ldr x20, [x14, #0x48]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "ldr d16, [x25, x16]\n"
+ "ldr d28, [x24, x16]\n"
+ "smlal v8.4s, v18.4h, v9.4h\n"
+ "smlal v3.4s, v19.4h, v9.4h\n"
+ "ldr d31, [x23, x16]\n"
+ "ldr d11, [x22, x16]\n"
+ "smlal v10.4s, v19.4h, v24.4h\n"
+ "smlal2 v21.4s, v18.8h, v9.8h\n"
+ "ldr d18, [x20, x16]\n"
+ "smlal2 v30.4s, v19.8h, v9.8h\n"
+ "smlal2 v27.4s, v19.8h, v24.8h\n"
+ "ldr x20, [x14, #0x58]\n"
+ "smlal v20.4s, v19.4h, v4.4h\n"
+ "smlal2 v1.4s, v19.8h, v4.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x24, [x14, #0x60]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v25.4h\n"
+ "ldr x23, [x14, #0x68]\n"
+ "ldr x22, [x14, #0x70]\n"
+ "smlal v10.4s, v23.4h, v7.4h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v21.4s, v19.8h, v25.8h\n"
+ "ldr d19, [x21, x16]\n"
+ "smlal v3.4s, v16.4h, v13.4h\n"
+ "smlal2 v30.4s, v16.8h, v13.8h\n"
+ "ldr d16, [x20, x16]\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "ldr x24, [x12, #0x60]\n"
- "smlal2 v9.4s, v27.8h, v16.8h\n"
- "smlal v28.4s, v10.4h, v7.4h\n"
- "ldr x23, [x12, #0x68]\n"
- "ldr x22, [x12, #0x70]\n"
- "smlal2 v30.4s, v13.8h, v8.8h\n"
- "ldr d13, [x21, x14]\n"
- "smlal2 v22.4s, v21.8h, v29.8h\n"
- "ldr d21, [x20, x14]\n"
- "smlal2 v2.4s, v27.8h, v17.8h\n"
- "smlal v3.4s, v27.4h, v29.4h\n"
- "ushll v13.8h, v13.8b, #0x0\n"
- "ldr x21, [x12, #0x78]\n"
- "smlal v0.4s, v27.4h, v8.4h\n"
- "smlal v6.4s, v15.4h, v4.4h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v20.4s, v23.4h, v14.4h\n"
+ "smlal2 v1.4s, v23.8h, v14.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x21, [x14, #0x78]\n"
+ "smlal2 v27.4s, v23.8h, v7.8h\n"
+ "smlal v8.4s, v23.4h, v13.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v9.4s, v10.8h, v7.8h\n"
- "smlal v28.4s, v23.4h, v1.4h\n"
- "add x11, x11, #0x48\n"
- "subs x15, x15, #0x1\n"
- "smlal2 v30.4s, v27.8h, v29.8h\n"
- "smlal2 v22.4s, v27.8h, v8.8h\n"
- "ldr d27, [x24, x14]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v2.4s, v15.8h, v4.8h\n"
- "ldr d15, [x23, x14]\n"
- "smlal v3.4s, v10.4h, v19.4h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal v0.4s, v11.4h, v31.4h\n"
- "smlal v6.4s, v11.4h, v8.4h\n"
- "add x10, x10, #0x20\n"
- "add x9, x9, #0x20\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
- "smlal v28.4s, v11.4h, v4.4h\n"
- "smlal2 v30.4s, v10.8h, v19.8h\n"
- "ldr d10, [x22, x14]\n"
- "smlal2 v22.4s, v11.8h, v31.8h\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal2 v2.4s, v11.8h, v8.8h\n"
- "ldr d8, [x21, x14]\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v13.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "add x13, x13, #0x48\n"
+ "smlal v3.4s, v23.4h, v25.4h\n"
+ "smlal2 v30.4s, v23.8h, v25.8h\n"
+ "ldr d23, [x24, x16]\n"
+ "subs x10, x10, #0x1\n"
+ "smlal v20.4s, v28.4h, v24.4h\n"
+ "smlal2 v1.4s, v28.8h, v24.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "smlal2 v27.4s, v31.8h, v2.8h\n"
+ "ldr d31, [x23, x16]\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal v10.4s, v18.4h, v25.4h\n"
+ "smlal2 v21.4s, v28.8h, v22.8h\n"
+ "ldr d28, [x22, x16]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v3.4s, v18.4h, v4.4h\n"
+ "smlal2 v30.4s, v18.8h, v4.8h\n"
+ "smlal v20.4s, v11.4h, v9.4h\n"
+ "smlal2 v1.4s, v11.8h, v9.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v27.4s, v18.8h, v25.8h\n"
+ "ldr d25, [x21, x16]\n"
+ "smlal v8.4s, v11.4h, v24.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v10.4s, v16.4h, v9.4h\n"
+ "smlal2 v21.4s, v11.8h, v24.8h\n"
+ "add x16, x16, #0x8\n"
+ "smlal v3.4s, v19.4h, v22.4h\n"
+ "smlal2 v30.4s, v19.8h, v22.8h\n"
+ "smlal v20.4s, v18.4h, v2.4h\n"
+ "smlal2 v1.4s, v18.8h, v2.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v27.4s, v16.8h, v9.8h\n"
+ "smlal v8.4s, v18.4h, v14.4h\n"
+ "smlal v10.4s, v31.4h, v4.4h\n"
+ "smlal2 v21.4s, v18.8h, v14.8h\n"
"smlal v3.4s, v23.4h, v7.4h\n"
- "ushll v8.8h, v8.8b, #0x0\n"
- "smlal v0.4s, v13.4h, v19.4h\n"
- "smlal v6.4s, v21.4h, v1.4h\n"
- "add x14, x14, #0x8\n"
- "smlal2 v9.4s, v11.8h, v4.8h\n"
- "smlal v28.4s, v13.4h, v17.4h\n"
"smlal2 v30.4s, v23.8h, v7.8h\n"
- "smlal2 v22.4s, v13.8h, v19.8h\n"
- "smlal2 v2.4s, v21.8h, v1.8h\n"
- "smlal v3.4s, v11.4h, v16.4h\n"
- "smlal v0.4s, v27.4h, v17.4h\n"
- "smlal v6.4s, v15.4h, v31.4h\n"
- "smlal2 v9.4s, v13.8h, v17.8h\n"
- "smlal v28.4s, v27.4h, v29.4h\n"
- "sqrdmulh v28.4s, v28.4s, v24.4s\n"
- "smlal2 v30.4s, v11.8h, v16.8h\n"
- "smlal2 v22.4s, v27.8h, v17.8h\n"
- "and v17.16b, v28.16b, v25.16b\n"
- "smlal2 v2.4s, v15.8h, v31.8h\n"
- "smlal v3.4s, v21.4h, v31.4h\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "smlal v0.4s, v10.4h, v16.4h\n"
- "smlal v6.4s, v10.4h, v29.4h\n"
- "sqadd v28.4s, v28.4s, v17.4s\n"
- "smlal2 v9.4s, v27.8h, v29.8h\n"
- "smlal2 v30.4s, v21.8h, v31.8h\n"
- "sqrdmulh v9.4s, v9.4s, v20.4s\n"
- "smlal2 v22.4s, v10.8h, v16.8h\n"
- "smlal2 v2.4s, v10.8h, v29.8h\n"
- "and v23.16b, v9.16b, v26.16b\n"
- "smlal v3.4s, v15.4h, v4.4h\n"
- "smlal v0.4s, v8.4h, v4.4h\n"
- "sqrdmulh v3.4s, v3.4s, v24.4s\n"
- "smlal v6.4s, v8.4h, v16.4h\n"
- "smlal2 v30.4s, v15.8h, v4.8h\n"
- "sqrdmulh v0.4s, v0.4s, v24.4s\n"
- "smlal2 v22.4s, v8.8h, v4.8h\n"
- "smlal2 v2.4s, v8.8h, v16.8h\n"
- "sqrdmulh v6.4s, v6.4s, v24.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v8.16b, v3.16b, v25.16b\n"
- "sqrdmulh v30.4s, v30.4s, v20.4s\n"
- "and v11.16b, v0.16b, v25.16b\n"
- "sqrdmulh v22.4s, v22.4s, v20.4s\n"
- "and v29.16b, v6.16b, v25.16b\n"
- "sqrdmulh v2.4s, v2.4s, v20.4s\n"
- "sqadd v9.4s, v9.4s, v23.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v13.16b, v30.16b, v26.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v21.16b, v22.16b, v26.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v23.16b, v2.16b, v26.16b\n"
- "sqadd v3.4s, v3.4s, v8.4s\n"
+ "smlal v20.4s, v19.4h, v7.4h\n"
+ "smlal2 v1.4s, v19.8h, v7.8h\n"
+ "smlal2 v27.4s, v31.8h, v4.8h\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v13.4h\n"
+ "smlal2 v21.4s, v16.8h, v4.8h\n"
+ "smlal v3.4s, v28.4h, v14.4h\n"
+ "smlal2 v30.4s, v28.8h, v14.8h\n"
+ "smlal v20.4s, v23.4h, v13.4h\n"
+ "smlal2 v1.4s, v23.8h, v13.8h\n"
+ "smlal2 v27.4s, v28.8h, v13.8h\n"
+ "smlal v8.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v25.4h, v14.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v3.4s, v25.4h, v2.4h\n"
+ "smlal2 v30.4s, v25.8h, v2.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v26.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v0.4s\n"
+ "smlal2 v27.4s, v25.8h, v14.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v26.4s\n"
+ "and v19.16b, v20.16b, v6.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v26.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "and v9.16b, v1.16b, v5.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v26.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v16.16b, v8.16b, v6.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v0.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v0.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v22.16b, v3.16b, v6.16b\n"
+ "sqadd v20.4s, v20.4s, v19.4s\n"
+ "and v13.16b, v10.16b, v6.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v28.16b, v21.16b, v5.16b\n"
+ "sqadd v1.4s, v1.4s, v9.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v0.16b, v30.16b, v5.16b\n"
"sshr v13.4s, v13.4s, #0x1f\n"
- "sqadd v0.4s, v0.4s, v11.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v29.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "srshl v28.4s, v28.4s, v25.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqadd v30.4s, v30.4s, v13.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqadd v22.4s, v22.4s, v21.4s\n"
- "srshl v6.4s, v6.4s, v25.4s\n"
- "sqadd v2.4s, v2.4s, v23.4s\n"
- "srshl v9.4s, v9.4s, v26.4s\n"
- "sqxtn v28.4h, v28.4s\n"
- "srshl v30.4s, v30.4s, v26.4s\n"
+ "and v18.16b, v27.16b, v5.16b\n"
+ "sqadd v8.4s, v8.4s, v16.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqadd v3.4s, v3.4s, v22.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v13.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v20.4s, v20.4s, v6.4s\n"
+ "srshl v8.4s, v8.4s, v6.4s\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "srshl v3.4s, v3.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v0.4s\n"
+ "srshl v10.4s, v10.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v5.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v30.4s, v30.4s, v5.4s\n"
"sqxtn v3.4h, v3.4s\n"
- "srshl v22.4s, v22.4s, v26.4s\n"
- "sqxtn v0.4h, v0.4s\n"
- "srshl v2.4s, v2.4s, v26.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v28.8h, v9.4s\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v21.4s\n"
"sqxtn2 v3.8h, v30.4s\n"
- "sqxtn2 v0.8h, v22.4s\n"
- "sqxtn2 v6.8h, v2.4s\n"
- "sqadd v28.8h, v28.8h, v5.8h\n"
- "sqadd v3.8h, v3.8h, v5.8h\n"
- "sqadd v0.8h, v0.8h, v5.8h\n"
- "sqadd v6.8h, v6.8h, v5.8h\n"
- "smax v28.8h, v28.8h, v14.8h\n"
- "smax v3.8h, v3.8h, v14.8h\n"
- "smax v0.8h, v0.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v28.8h, v28.8h, v12.8h\n"
- "smin v3.8h, v3.8h, v12.8h\n"
- "smin v0.8h, v0.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str d28, [x28, x13]\n"
+ "sqxtn2 v10.8h, v27.4s\n"
+ "sqadd v20.8h, v20.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v3.8h, v3.8h, v12.8h\n"
+ "sqadd v10.8h, v10.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v15.8h\n"
+ "smax v8.8h, v8.8h, v15.8h\n"
+ "smax v3.8h, v3.8h, v15.8h\n"
+ "smax v10.8h, v10.8h, v15.8h\n"
+ "smin v20.8h, v20.8h, v17.8h\n"
+ "smin v8.8h, v8.8h, v17.8h\n"
+ "smin v3.8h, v3.8h, v17.8h\n"
+ "smin v10.8h, v10.8h, v17.8h\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v3.16b, v3.16b, v3.16b\n"
- "uzp1 v0.16b, v0.16b, v0.16b\n"
- "str d3, [x27, x13]\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d0, [x26, x13]\n"
- "str d6, [x25, x13]\n"
- "ldr q28, [x20, #0x0]\n"
- "ldr q9, [x20, #0x10]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d20, [x9, x15]\n"
+ "str d8, [x28, x15]\n"
+ "str d3, [x27, x15]\n"
+ "str d10, [x26, x15]\n"
+ "add x15, x15, #0x8\n"
+ "ldr q20, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d19, [x11, #0x0]\n"
- "ldr d7, [x11, #0x8]\n"
- "add x13, x13, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d1, [x11, #0x10]\n"
- "ldr d17, [x11, #0x18]\n"
- "mov v3.16b, v28.16b\n"
- "mov v30.16b, v9.16b\n"
- "ldr d8, [x11, #0x20]\n"
- "ldr d31, [x11, #0x28]\n"
- "mov v0.16b, v28.16b\n"
- "mov v22.16b, v9.16b\n"
- "ldr d29, [x11, #0x30]\n"
- "ldr d16, [x11, #0x38]\n"
- "mov v6.16b, v28.16b\n"
- "mov v2.16b, v9.16b\n"
- "ldr d4, [x11, #0x40]\n"
- "ldp x23, x22, [x12, #0x0]\n"
- "usubl v19.8h, v19.8b, v18.8b\n"
- "usubl v7.8h, v7.8b, v18.8b\n"
- "ldp x21, x20, [x12, #0x10]\n"
- "ldr d23, [x23, x14]\n"
- "usubl v1.8h, v1.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "ldr d10, [x22, x14]\n"
- "ldr d11, [x21, x14]\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "ldr d13, [x20, x14]\n"
- "ldr x20, [x12, #0x20]\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "ldr d27, [x20, x14]\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d22, [x13, #0x0]\n"
+ "ldr d24, [x13, #0x8]\n"
+ "ldr d9, [x13, #0x10]\n"
+ "ldr d7, [x13, #0x18]\n"
+ "ldr d25, [x13, #0x20]\n"
+ "ldr d4, [x13, #0x28]\n"
+ "mov v8.16b, v20.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d13, [x13, #0x30]\n"
+ "ldr d14, [x13, #0x38]\n"
+ "mov v3.16b, v20.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d2, [x13, #0x40]\n"
+ "ldp x23, x22, [x14, #0x0]\n"
+ "mov v10.16b, v20.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "usubl v22.8h, v22.8b, v29.8b\n"
+ "usubl v24.8h, v24.8b, v29.8b\n"
+ "usubl v9.8h, v9.8b, v29.8b\n"
+ "usubl v7.8h, v7.8b, v29.8b\n"
+ "ldp x21, x20, [x14, #0x10]\n"
+ "usubl v25.8h, v25.8b, v29.8b\n"
+ "usubl v4.8h, v4.8b, v29.8b\n"
+ "usubl v13.8h, v13.8b, v29.8b\n"
+ "usubl v14.8h, v14.8b, v29.8b\n"
+ "ldr d11, [x23, x16]\n"
+ "ldr d28, [x22, x16]\n"
+ "ldr d18, [x21, x16]\n"
+ "ldr d19, [x20, x16]\n"
+ "usubl v2.8h, v2.8b, v29.8b\n"
+ "ldr x20, [x14, #0x20]\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "ushll v13.8h, v13.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr d23, [x20, x16]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q26, [x10, #0x0]\n"
- "ldr q25, [x9, #0x0]\n"
- "smlal v28.4s, v23.4h, v8.4h\n"
- "smlal2 v9.4s, v23.8h, v8.8h\n"
- "ldr q24, [x10, #0x10]\n"
- "ldr q20, [x9, #0x10]\n"
- "smlal v28.4s, v10.4h, v19.4h\n"
- "smlal v3.4s, v23.4h, v17.4h\n"
- "ldr x20, [x12, #0x28]\n"
- "ldr d21, [x20, x14]\n"
- "smlal v0.4s, v23.4h, v7.4h\n"
- "smlal v6.4s, v23.4h, v19.4h\n"
- "smlal2 v9.4s, v10.8h, v19.8h\n"
- "ldr x20, [x12, #0x38]\n"
- "ldr d15, [x20, x14]\n"
- "smlal v28.4s, v13.4h, v31.4h\n"
- "smlal2 v30.4s, v23.8h, v17.8h\n"
- "smlal2 v22.4s, v23.8h, v7.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "ldr d10, [x20, x14]\n"
- "smlal2 v2.4s, v23.8h, v19.8h\n"
- "smlal v3.4s, v11.4h, v1.4h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "ldr x20, [x12, #0x40]\n"
- "ldr d23, [x20, x14]\n"
- "smlal v0.4s, v13.4h, v1.4h\n"
- "smlal v6.4s, v13.4h, v7.4h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal2 v9.4s, v13.8h, v31.8h\n"
- "smlal v28.4s, v27.4h, v16.4h\n"
- "ldr x20, [x12, #0x48]\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal2 v30.4s, v11.8h, v1.8h\n"
- "ldr d11, [x20, x14]\n"
- "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ldr q16, [x12, #0x0]\n"
+ "ldr q5, [x11, #0x0]\n"
+ "smlal v20.4s, v11.4h, v25.4h\n"
+ "smlal2 v1.4s, v11.8h, v25.8h\n"
+ "ldr q6, [x12, #0x10]\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal v8.4s, v11.4h, v7.4h\n"
+ "smlal v3.4s, v11.4h, v24.4h\n"
+ "ldr x25, [x14, #0x28]\n"
+ "smlal v10.4s, v11.4h, v22.4h\n"
+ "ldr x23, [x14, #0x38]\n"
+ "smlal2 v21.4s, v11.8h, v7.8h\n"
+ "smlal2 v30.4s, v11.8h, v24.8h\n"
+ "smlal2 v27.4s, v11.8h, v22.8h\n"
+ "ldr x22, [x14, #0x30]\n"
+ "ldr x21, [x14, #0x40]\n"
+ "smlal v20.4s, v28.4h, v22.4h\n"
+ "smlal2 v1.4s, v28.8h, v22.8h\n"
+ "ldr x20, [x14, #0x48]\n"
+ "ldr x24, [x14, #0x50]\n"
+ "ldr d28, [x25, x16]\n"
+ "ldr d26, [x23, x16]\n"
+ "smlal v8.4s, v18.4h, v9.4h\n"
+ "smlal v3.4s, v19.4h, v9.4h\n"
+ "ldr d11, [x22, x16]\n"
+ "ldr d0, [x21, x16]\n"
+ "smlal v10.4s, v19.4h, v24.4h\n"
+ "smlal2 v21.4s, v18.8h, v9.8h\n"
+ "ldr d18, [x20, x16]\n"
+ "smlal2 v30.4s, v19.8h, v9.8h\n"
+ "smlal2 v27.4s, v19.8h, v24.8h\n"
+ "ldr x20, [x14, #0x58]\n"
+ "smlal v20.4s, v19.4h, v4.4h\n"
+ "smlal2 v1.4s, v19.8h, v4.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x23, [x14, #0x60]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v25.4h\n"
+ "ldr x22, [x14, #0x68]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "smlal v10.4s, v23.4h, v7.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v21.4s, v19.8h, v25.8h\n"
+ "ldr d19, [x24, x16]\n"
+ "smlal v3.4s, v28.4h, v13.4h\n"
+ "smlal2 v30.4s, v28.8h, v13.8h\n"
+ "ldr d28, [x20, x16]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v20.4s, v23.4h, v14.4h\n"
+ "smlal2 v1.4s, v23.8h, v14.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x14, #0x78]\n"
+ "smlal2 v27.4s, v23.8h, v7.8h\n"
+ "smlal v8.4s, v23.4h, v13.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "tst x17, #0x7\n"
+ "smlal v10.4s, v11.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v13.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "add x12, x12, #0x20\n"
+ "smlal v3.4s, v23.4h, v25.4h\n"
+ "smlal2 v30.4s, v23.8h, v25.8h\n"
+ "ldr d23, [x23, x16]\n"
+ "add x11, x11, #0x20\n"
+ "smlal v20.4s, v26.4h, v24.4h\n"
+ "smlal2 v1.4s, v26.8h, v24.8h\n"
+ "smlal2 v27.4s, v11.8h, v2.8h\n"
+ "ldr d11, [x22, x16]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v18.4h, v25.4h\n"
+ "smlal2 v21.4s, v26.8h, v22.8h\n"
+ "ldr d26, [x21, x16]\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v2.4s, v13.8h, v7.8h\n"
- "smlal v3.4s, v13.4h, v8.4h\n"
- "ldr x24, [x12, #0x50]\n"
- "ldr x20, [x12, #0x58]\n"
- "smlal v0.4s, v21.4h, v29.4h\n"
- "smlal v6.4s, v27.4h, v17.4h\n"
+ "smlal v3.4s, v18.4h, v4.4h\n"
+ "smlal2 v30.4s, v18.8h, v4.8h\n"
+ "smlal v20.4s, v0.4h, v9.4h\n"
+ "smlal2 v1.4s, v0.8h, v9.8h\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "ldr x23, [x12, #0x60]\n"
- "smlal2 v9.4s, v27.8h, v16.8h\n"
- "smlal v28.4s, v15.4h, v7.4h\n"
- "ldr x22, [x12, #0x68]\n"
- "ldr x21, [x12, #0x70]\n"
- "smlal2 v30.4s, v13.8h, v8.8h\n"
- "ldr d13, [x24, x14]\n"
- "smlal2 v22.4s, v21.8h, v29.8h\n"
- "ldr d21, [x20, x14]\n"
- "smlal2 v2.4s, v27.8h, v17.8h\n"
- "smlal v3.4s, v27.4h, v29.4h\n"
- "ushll v13.8h, v13.8b, #0x0\n"
- "ldr x20, [x12, #0x78]\n"
- "smlal v0.4s, v27.4h, v8.4h\n"
- "smlal v6.4s, v10.4h, v4.4h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "tst x16, #0x7\n"
- "smlal2 v9.4s, v15.8h, v7.8h\n"
- "smlal v28.4s, v23.4h, v1.4h\n"
- "add x10, x10, #0x20\n"
- "add x9, x9, #0x20\n"
- "smlal2 v30.4s, v27.8h, v29.8h\n"
- "smlal2 v22.4s, v27.8h, v8.8h\n"
- "ldr d27, [x23, x14]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v2.4s, v10.8h, v4.8h\n"
- "ldr d10, [x22, x14]\n"
- "smlal v3.4s, v15.4h, v19.4h\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal v0.4s, v11.4h, v31.4h\n"
- "smlal v6.4s, v11.4h, v8.4h\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
- "smlal v28.4s, v11.4h, v4.4h\n"
- "smlal2 v30.4s, v15.8h, v19.8h\n"
- "ldr d15, [x21, x14]\n"
- "smlal2 v22.4s, v11.8h, v31.8h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal2 v2.4s, v11.8h, v8.8h\n"
- "ldr d8, [x20, x14]\n"
+ "smlal2 v27.4s, v18.8h, v25.8h\n"
+ "ldr d25, [x20, x16]\n"
+ "smlal v8.4s, v0.4h, v24.4h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v10.4s, v28.4h, v9.4h\n"
+ "smlal2 v21.4s, v0.8h, v24.8h\n"
+ "add x16, x16, #0x8\n"
+ "smlal v3.4s, v19.4h, v22.4h\n"
+ "smlal2 v30.4s, v19.8h, v22.8h\n"
+ "smlal v20.4s, v18.4h, v2.4h\n"
+ "smlal2 v1.4s, v18.8h, v2.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v27.4s, v28.8h, v9.8h\n"
+ "smlal v8.4s, v18.4h, v14.4h\n"
+ "smlal v10.4s, v11.4h, v4.4h\n"
+ "smlal2 v21.4s, v18.8h, v14.8h\n"
"smlal v3.4s, v23.4h, v7.4h\n"
- "ushll v8.8h, v8.8b, #0x0\n"
- "smlal v0.4s, v13.4h, v19.4h\n"
- "smlal v6.4s, v21.4h, v1.4h\n"
- "add x14, x14, #0x8\n"
- "smlal2 v9.4s, v11.8h, v4.8h\n"
- "smlal v28.4s, v13.4h, v17.4h\n"
"smlal2 v30.4s, v23.8h, v7.8h\n"
- "smlal2 v22.4s, v13.8h, v19.8h\n"
- "smlal2 v2.4s, v21.8h, v1.8h\n"
- "smlal v3.4s, v11.4h, v16.4h\n"
- "smlal v0.4s, v27.4h, v17.4h\n"
- "smlal v6.4s, v10.4h, v31.4h\n"
- "smlal2 v9.4s, v13.8h, v17.8h\n"
- "smlal v28.4s, v27.4h, v29.4h\n"
- "sqrdmulh v28.4s, v28.4s, v26.4s\n"
- "smlal2 v30.4s, v11.8h, v16.8h\n"
- "smlal2 v22.4s, v27.8h, v17.8h\n"
- "and v1.16b, v28.16b, v25.16b\n"
- "smlal2 v2.4s, v10.8h, v31.8h\n"
- "smlal v3.4s, v21.4h, v31.4h\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smlal v0.4s, v15.4h, v16.4h\n"
- "smlal v6.4s, v15.4h, v29.4h\n"
- "sqadd v28.4s, v28.4s, v1.4s\n"
- "smlal2 v9.4s, v27.8h, v29.8h\n"
- "smlal2 v30.4s, v21.8h, v31.8h\n"
- "sqrdmulh v9.4s, v9.4s, v24.4s\n"
- "smlal2 v22.4s, v15.8h, v16.8h\n"
- "smlal2 v2.4s, v15.8h, v29.8h\n"
- "and v27.16b, v9.16b, v20.16b\n"
- "smlal v3.4s, v10.4h, v4.4h\n"
- "smlal v0.4s, v8.4h, v4.4h\n"
- "sqrdmulh v3.4s, v3.4s, v26.4s\n"
- "smlal v6.4s, v8.4h, v16.4h\n"
- "smlal2 v30.4s, v10.8h, v4.8h\n"
- "sqrdmulh v0.4s, v0.4s, v26.4s\n"
- "smlal2 v22.4s, v8.8h, v4.8h\n"
- "smlal2 v2.4s, v8.8h, v16.8h\n"
- "sqrdmulh v6.4s, v6.4s, v26.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "sqrdmulh v30.4s, v30.4s, v24.4s\n"
- "and v4.16b, v0.16b, v25.16b\n"
- "sqrdmulh v22.4s, v22.4s, v24.4s\n"
- "and v17.16b, v6.16b, v25.16b\n"
- "sqrdmulh v2.4s, v2.4s, v24.4s\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v8.16b, v30.16b, v20.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v26.16b, v22.16b, v20.16b\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "and v11.16b, v2.16b, v20.16b\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v0.4s, v0.4s, v4.4s\n"
+ "smlal v20.4s, v19.4h, v7.4h\n"
+ "smlal2 v1.4s, v19.8h, v7.8h\n"
+ "smlal2 v27.4s, v11.8h, v4.8h\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v26.4h, v13.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "smlal v3.4s, v26.4h, v14.4h\n"
+ "smlal2 v30.4s, v26.8h, v14.8h\n"
+ "smlal v20.4s, v23.4h, v13.4h\n"
+ "smlal2 v1.4s, v23.8h, v13.8h\n"
+ "smlal2 v27.4s, v26.8h, v13.8h\n"
+ "smlal v8.4s, v11.4h, v2.4h\n"
+ "smlal v10.4s, v25.4h, v14.4h\n"
+ "smlal2 v21.4s, v11.8h, v2.8h\n"
+ "smlal v3.4s, v25.4h, v2.4h\n"
+ "smlal2 v30.4s, v25.8h, v2.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v6.4s\n"
+ "smlal2 v27.4s, v25.8h, v14.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "and v0.16b, v20.16b, v5.16b\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "and v24.16b, v1.16b, v31.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v16.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v22.16b, v8.16b, v5.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v6.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v26.16b, v3.16b, v5.16b\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "and v16.16b, v10.16b, v5.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v31.16b\n"
+ "sqadd v1.4s, v1.4s, v24.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v17.4s\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "srshl v28.4s, v28.4s, v25.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqadd v30.4s, v30.4s, v8.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqadd v22.4s, v22.4s, v26.4s\n"
- "srshl v6.4s, v6.4s, v25.4s\n"
- "sqadd v2.4s, v2.4s, v11.4s\n"
- "srshl v9.4s, v9.4s, v20.4s\n"
- "sqxtn v28.4h, v28.4s\n"
- "srshl v30.4s, v30.4s, v20.4s\n"
+ "and v7.16b, v30.16b, v31.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v9.16b, v27.16b, v31.16b\n"
+ "sqadd v8.4s, v8.4s, v22.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v3.4s, v3.4s, v26.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "srshl v8.4s, v8.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v5.4s\n"
+ "sqadd v30.4s, v30.4s, v7.4s\n"
+ "srshl v10.4s, v10.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v9.4s\n"
+ "srshl v1.4s, v1.4s, v31.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v30.4s, v30.4s, v31.4s\n"
"sqxtn v3.4h, v3.4s\n"
- "srshl v22.4s, v22.4s, v20.4s\n"
- "sqxtn v0.4h, v0.4s\n"
- "srshl v2.4s, v2.4s, v20.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v28.8h, v9.4s\n"
+ "srshl v27.4s, v27.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v21.4s\n"
"sqxtn2 v3.8h, v30.4s\n"
- "sqxtn2 v0.8h, v22.4s\n"
- "sqxtn2 v6.8h, v2.4s\n"
- "sqadd v28.8h, v28.8h, v5.8h\n"
- "sqadd v3.8h, v3.8h, v5.8h\n"
- "sqadd v0.8h, v0.8h, v5.8h\n"
- "sqadd v6.8h, v6.8h, v5.8h\n"
- "smax v28.8h, v28.8h, v14.8h\n"
- "smax v3.8h, v3.8h, v14.8h\n"
- "smax v0.8h, v0.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v28.8h, v28.8h, v12.8h\n"
- "smin v3.8h, v3.8h, v12.8h\n"
- "smin v0.8h, v0.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
- "str d28, [x28, x13]\n"
+ "sqxtn2 v10.8h, v27.4s\n"
+ "sqadd v20.8h, v20.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v3.8h, v3.8h, v12.8h\n"
+ "sqadd v10.8h, v10.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v15.8h\n"
+ "smax v8.8h, v8.8h, v15.8h\n"
+ "smax v3.8h, v3.8h, v15.8h\n"
+ "smax v10.8h, v10.8h, v15.8h\n"
+ "smin v20.8h, v20.8h, v17.8h\n"
+ "smin v8.8h, v8.8h, v17.8h\n"
+ "smin v3.8h, v3.8h, v17.8h\n"
+ "smin v10.8h, v10.8h, v17.8h\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v3.16b, v3.16b, v3.16b\n"
- "uzp1 v0.16b, v0.16b, v0.16b\n"
- "str d3, [x27, x13]\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "str d0, [x26, x13]\n"
- "str d6, [x25, x13]\n"
- "add x13, x13, #0x8\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d20, [x9, x15]\n"
+ "str d8, [x28, x15]\n"
+ "str d3, [x27, x15]\n"
+ "str d10, [x26, x15]\n"
+ "add x15, x15, #0x8\n"
"beq 64f\n"
- "add x11, x11, #0x48\n"
+ "add x13, x13, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x16, #2, 5f\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
- "tbz x16, #1, 4f\n"
- "ld1 { v9.d }[0], [x20], #0x8\n"
- "tbz x16, #0, 7f\n"
- "ld1 { v9.s }[2], [x20]\n"
+ "tbz x17, #2, 5f\n"
+ "ld1 { v20.4s }, [x20], #0x10\n"
+ "tbz x17, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x17, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x16, #0, 7f\n"
- "ld1 { v9.s }[0], [x20]\n"
+ "tbz x17, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x16, #1, 6f\n"
- "ld1 { v28.d }[0], [x20], #0x8\n"
- "tbz x16, #0, 7f\n"
- "ld1 { v28.s }[2], [x20]\n"
+ "tbz x17, #1, 6f\n"
+ "ld1 { v20.d }[0], [x20], #0x8\n"
+ "tbz x17, #0, 7f\n"
+ "ld1 { v20.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 7f\n"
- "ld1 { v28.s }[0], [x20]\n"
+ "tbz x17, #0, 7f\n"
+ "ld1 { v20.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d19, [x11, #0x0]\n"
- "ldr d7, [x11, #0x8]\n"
- "mov v3.16b, v28.16b\n"
- "mov v30.16b, v9.16b\n"
- "ldr d1, [x11, #0x10]\n"
- "ldr d17, [x11, #0x18]\n"
- "mov v0.16b, v28.16b\n"
- "mov v22.16b, v9.16b\n"
- "ldr d8, [x11, #0x20]\n"
- "ldr d31, [x11, #0x28]\n"
- "mov v6.16b, v28.16b\n"
- "mov v2.16b, v9.16b\n"
- "ldr d29, [x11, #0x30]\n"
- "ldr d16, [x11, #0x38]\n"
- "usubl v19.8h, v19.8b, v18.8b\n"
- "usubl v7.8h, v7.8b, v18.8b\n"
- "ldr d4, [x11, #0x40]\n"
- "ldp x24, x23, [x12, #0x0]\n"
- "usubl v1.8h, v1.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "ldp x22, x21, [x12, #0x10]\n"
- "ldr x20, [x12, #0x20]\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "add x24, x24, x14\n"
- "add x23, x23, x14\n"
- "add x22, x22, x14\n"
- "add x21, x21, x14\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 9f\n"
- "ld1 { v23.s }[0], [x24], #0x4\n"
- "ld1 { v10.s }[0], [x23], #0x4\n"
- "ld1 { v11.s }[0], [x22], #0x4\n"
- "ld1 { v13.s }[0], [x21], #0x4\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 8f\n"
- "ld1 { v23.h }[2], [x24], #0x2\n"
- "ld1 { v10.h }[2], [x23], #0x2\n"
- "ld1 { v11.h }[2], [x22], #0x2\n"
- "ld1 { v13.h }[2], [x21], #0x2\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 11f\n"
- "ld1 { v23.b }[6], [x24]\n"
- "ld1 { v10.b }[6], [x23]\n"
- "ld1 { v11.b }[6], [x22]\n"
- "ld1 { v13.b }[6], [x21]\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ldr d22, [x13, #0x0]\n"
+ "ldr d24, [x13, #0x8]\n"
+ "mov v8.16b, v20.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d9, [x13, #0x10]\n"
+ "ldr d7, [x13, #0x18]\n"
+ "mov v3.16b, v20.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d25, [x13, #0x20]\n"
+ "ldr d4, [x13, #0x28]\n"
+ "mov v10.16b, v20.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d13, [x13, #0x30]\n"
+ "ldr d14, [x13, #0x38]\n"
+ "usubl v22.8h, v22.8b, v29.8b\n"
+ "usubl v24.8h, v24.8b, v29.8b\n"
+ "ldr d2, [x13, #0x40]\n"
+ "ldp x24, x23, [x14, #0x0]\n"
+ "usubl v9.8h, v9.8b, v29.8b\n"
+ "usubl v7.8h, v7.8b, v29.8b\n"
+ "usubl v25.8h, v25.8b, v29.8b\n"
+ "usubl v4.8h, v4.8b, v29.8b\n"
+ "usubl v13.8h, v13.8b, v29.8b\n"
+ "usubl v14.8h, v14.8b, v29.8b\n"
+ "ldp x22, x21, [x14, #0x10]\n"
+ "usubl v2.8h, v2.8b, v29.8b\n"
+ "add x24, x24, x16\n"
+ "add x23, x23, x16\n"
+ "ldr x20, [x14, #0x20]\n"
+ "add x22, x22, x16\n"
+ "add x21, x21, x16\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 9f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v18.s }[0], [x22], #0x4\n"
+ "ld1 { v19.s }[0], [x21], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 8f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 11f\n"
+ "ld1 { v11.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x16, #0, 11f\n"
- "ld1 { v23.b }[4], [x24]\n"
- "ld1 { v10.b }[4], [x23]\n"
- "ld1 { v11.b }[4], [x22]\n"
- "ld1 { v13.b }[4], [x21]\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x17, #0, 11f\n"
+ "ld1 { v11.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x16, #1, 10f\n"
- "ld1 { v23.h }[0], [x24], #0x2\n"
- "ld1 { v10.h }[0], [x23], #0x2\n"
- "ld1 { v11.h }[0], [x22], #0x2\n"
- "ld1 { v13.h }[0], [x21], #0x2\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 11f\n"
- "ld1 { v23.b }[2], [x24]\n"
- "ld1 { v10.b }[2], [x23]\n"
- "ld1 { v11.b }[2], [x22]\n"
- "ld1 { v13.b }[2], [x21]\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x17, #1, 10f\n"
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v18.h }[0], [x22], #0x2\n"
+ "ld1 { v19.h }[0], [x21], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 11f\n"
+ "ld1 { v11.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 11f\n"
- "ld1 { v23.b }[0], [x24]\n"
- "ld1 { v10.b }[0], [x23]\n"
- "ld1 { v11.b }[0], [x22]\n"
- "ld1 { v13.b }[0], [x21]\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x17, #0, 11f\n"
+ "ld1 { v11.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v18.b }[0], [x22]\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v23.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal v28.4s, v23.4h, v8.4h\n"
- "smlal2 v9.4s, v23.8h, v8.8h\n"
- "ldr x20, [x12, #0x28]\n"
- "smlal v3.4s, v23.4h, v17.4h\n"
- "smlal2 v30.4s, v23.8h, v17.8h\n"
- "ushll v10.8h, v10.8b, #0x0\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "smlal v0.4s, v23.4h, v7.4h\n"
- "smlal2 v22.4s, v23.8h, v7.8h\n"
- "add x20, x20, x14\n"
- "smlal v6.4s, v23.4h, v19.4h\n"
- "smlal2 v2.4s, v23.8h, v19.8h\n"
- "ushll v13.8h, v13.8b, #0x0\n"
- "smlal v28.4s, v10.4h, v19.4h\n"
- "smlal2 v9.4s, v10.8h, v19.8h\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "smlal v3.4s, v11.4h, v1.4h\n"
- "smlal2 v30.4s, v11.8h, v1.8h\n"
- "smlal v28.4s, v13.4h, v31.4h\n"
- "smlal2 v9.4s, v13.8h, v31.8h\n"
- "smlal v3.4s, v13.4h, v8.4h\n"
- "smlal2 v30.4s, v13.8h, v8.8h\n"
- "smlal v0.4s, v13.4h, v1.4h\n"
- "smlal2 v22.4s, v13.8h, v1.8h\n"
- "smlal v6.4s, v13.4h, v7.4h\n"
- "smlal2 v2.4s, v13.8h, v7.8h\n"
- "tbz x16, #2, 13f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 12f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 15f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x14, #0x28]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v20.4s, v11.4h, v25.4h\n"
+ "smlal2 v1.4s, v11.8h, v25.8h\n"
+ "smlal v8.4s, v11.4h, v7.4h\n"
+ "smlal2 v21.4s, v11.8h, v7.8h\n"
+ "add x20, x20, x16\n"
+ "smlal v3.4s, v11.4h, v24.4h\n"
+ "smlal2 v30.4s, v11.8h, v24.8h\n"
+ "smlal v10.4s, v11.4h, v22.4h\n"
+ "smlal2 v27.4s, v11.8h, v22.8h\n"
+ "smlal v20.4s, v28.4h, v22.4h\n"
+ "smlal2 v1.4s, v28.8h, v22.8h\n"
+ "smlal v8.4s, v18.4h, v9.4h\n"
+ "smlal2 v21.4s, v18.8h, v9.8h\n"
+ "smlal v3.4s, v19.4h, v9.4h\n"
+ "smlal2 v30.4s, v19.8h, v9.8h\n"
+ "smlal v10.4s, v19.4h, v24.4h\n"
+ "smlal2 v27.4s, v19.8h, v24.8h\n"
+ "smlal v20.4s, v19.4h, v4.4h\n"
+ "smlal2 v1.4s, v19.8h, v4.8h\n"
+ "smlal v8.4s, v19.4h, v25.4h\n"
+ "smlal2 v21.4s, v19.8h, v25.8h\n"
+ "tbz x17, #2, 13f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 12f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 15f\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x16, #0, 15f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x17, #0, 15f\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x16, #1, 14f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 15f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x17, #1, 14f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 15f\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 15f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x17, #0, 15f\n"
+ "ld1 { v6.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v0.4s, v26.4h, v29.4h\n"
- "smlal2 v22.4s, v26.8h, v29.8h\n"
- "ldr x20, [x12, #0x30]\n"
- "smlal v28.4s, v27.4h, v16.4h\n"
- "smlal2 v9.4s, v27.8h, v16.8h\n"
- "add x20, x20, x14\n"
- "smlal v3.4s, v27.4h, v29.4h\n"
- "smlal2 v30.4s, v27.8h, v29.8h\n"
- "smlal v0.4s, v27.4h, v8.4h\n"
- "smlal2 v22.4s, v27.8h, v8.8h\n"
- "smlal v6.4s, v27.4h, v17.4h\n"
- "smlal2 v2.4s, v27.8h, v17.8h\n"
- "tbz x16, #2, 17f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 16f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 19f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x14, #0x30]\n"
+ "smlal v20.4s, v23.4h, v14.4h\n"
+ "smlal2 v1.4s, v23.8h, v14.8h\n"
+ "smlal v8.4s, v23.4h, v13.4h\n"
+ "smlal2 v21.4s, v23.8h, v13.8h\n"
+ "smlal v10.4s, v23.4h, v7.4h\n"
+ "smlal2 v27.4s, v23.8h, v7.8h\n"
+ "smlal v3.4s, v6.4h, v13.4h\n"
+ "smlal2 v30.4s, v6.8h, v13.8h\n"
+ "add x20, x20, x16\n"
+ "smlal v3.4s, v23.4h, v25.4h\n"
+ "smlal2 v30.4s, v23.8h, v25.8h\n"
+ "tbz x17, #2, 17f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 16f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 19f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x16, #0, 19f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x17, #0, 19f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x16, #1, 18f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 19f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x17, #1, 18f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 19f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 19f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x17, #0, 19f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr x20, [x12, #0x38]\n"
- "smlal v6.4s, v23.4h, v4.4h\n"
- "smlal2 v2.4s, v23.8h, v4.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 21f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 20f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 23f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x14, #0x38]\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "smlal2 v27.4s, v26.8h, v2.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 21f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 20f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 23f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x16, #0, 23f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x17, #0, 23f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x16, #1, 22f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 23f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x17, #1, 22f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 23f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 23f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x17, #0, 23f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ushll v21.8h, v21.8b, #0x0\n"
- "ldr x20, [x12, #0x40]\n"
- "smlal v28.4s, v21.4h, v7.4h\n"
- "smlal2 v9.4s, v21.8h, v7.8h\n"
- "smlal v3.4s, v21.4h, v19.4h\n"
- "smlal2 v30.4s, v21.8h, v19.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 25f\n"
- "ld1 { v18.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 24f\n"
- "ld1 { v18.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 27f\n"
- "ld1 { v18.b }[6], [x20]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x14, #0x40]\n"
+ "smlal v20.4s, v31.4h, v24.4h\n"
+ "smlal2 v1.4s, v31.8h, v24.8h\n"
+ "smlal v8.4s, v31.4h, v22.4h\n"
+ "smlal2 v21.4s, v31.8h, v22.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 25f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 24f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 27f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x16, #0, 27f\n"
- "ld1 { v18.b }[4], [x20]\n"
+ "tbz x17, #0, 27f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x16, #1, 26f\n"
- "ld1 { v18.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 27f\n"
- "ld1 { v18.b }[2], [x20]\n"
+ "tbz x17, #1, 26f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 27f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 27f\n"
- "ld1 { v18.b }[0], [x20]\n"
+ "tbz x17, #0, 27f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ushll v18.8h, v18.8b, #0x0\n"
- "ldr x20, [x12, #0x48]\n"
- "smlal v28.4s, v18.4h, v1.4h\n"
- "smlal2 v9.4s, v18.8h, v1.8h\n"
- "smlal v3.4s, v18.4h, v7.4h\n"
- "smlal2 v30.4s, v18.8h, v7.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 29f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 28f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 31f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x14, #0x48]\n"
+ "smlal v20.4s, v26.4h, v9.4h\n"
+ "smlal2 v1.4s, v26.8h, v9.8h\n"
+ "smlal v8.4s, v26.4h, v24.4h\n"
+ "smlal2 v21.4s, v26.8h, v24.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 29f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 28f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 31f\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x16, #0, 31f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x17, #0, 31f\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x16, #1, 30f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 31f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x17, #1, 30f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 31f\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 31f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x17, #0, 31f\n"
+ "ld1 { v18.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ushll v15.8h, v15.8b, #0x0\n"
- "ldr x20, [x12, #0x50]\n"
- "smlal v28.4s, v15.4h, v4.4h\n"
- "smlal2 v9.4s, v15.8h, v4.8h\n"
- "smlal v3.4s, v15.4h, v16.4h\n"
- "smlal2 v30.4s, v15.8h, v16.8h\n"
- "add x20, x20, x14\n"
- "smlal v0.4s, v15.4h, v31.4h\n"
- "smlal2 v22.4s, v15.8h, v31.8h\n"
- "smlal v6.4s, v15.4h, v8.4h\n"
- "smlal2 v2.4s, v15.8h, v8.8h\n"
- "tbz x16, #2, 33f\n"
- "ld1 { v20.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 32f\n"
- "ld1 { v20.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 35f\n"
- "ld1 { v20.b }[6], [x20]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x14, #0x50]\n"
+ "smlal v20.4s, v18.4h, v2.4h\n"
+ "smlal2 v1.4s, v18.8h, v2.8h\n"
+ "smlal v8.4s, v18.4h, v14.4h\n"
+ "smlal2 v21.4s, v18.8h, v14.8h\n"
+ "smlal v3.4s, v18.4h, v4.4h\n"
+ "smlal2 v30.4s, v18.8h, v4.8h\n"
+ "smlal v10.4s, v18.4h, v25.4h\n"
+ "add x20, x20, x16\n"
+ "smlal2 v27.4s, v18.8h, v25.8h\n"
+ "tbz x17, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x16, #0, 35f\n"
- "ld1 { v20.b }[4], [x20]\n"
+ "tbz x17, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x16, #1, 34f\n"
- "ld1 { v20.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 35f\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "tbz x17, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 35f\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "tbz x17, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ushll v20.8h, v20.8b, #0x0\n"
- "ldr x20, [x12, #0x58]\n"
- "smlal v28.4s, v20.4h, v17.4h\n"
- "smlal2 v9.4s, v20.8h, v17.8h\n"
- "smlal v0.4s, v20.4h, v19.4h\n"
- "smlal2 v22.4s, v20.8h, v19.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 37f\n"
- "ld1 { v11.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 36f\n"
- "ld1 { v11.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 39f\n"
- "ld1 { v11.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x14, #0x58]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v1.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v22.4h\n"
+ "smlal2 v30.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 37f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 36f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 39f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x16, #0, 39f\n"
- "ld1 { v11.b }[4], [x20]\n"
+ "tbz x17, #0, 39f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x16, #1, 38f\n"
- "ld1 { v11.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 39f\n"
- "ld1 { v11.b }[2], [x20]\n"
+ "tbz x17, #1, 38f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 39f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 39f\n"
- "ld1 { v11.b }[0], [x20]\n"
+ "tbz x17, #0, 39f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v11.8h, v11.8b, #0x0\n"
- "ldr x20, [x12, #0x60]\n"
- "smlal v3.4s, v11.4h, v31.4h\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "smlal v6.4s, v11.4h, v1.4h\n"
- "smlal2 v2.4s, v11.8h, v1.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 41f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 40f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 43f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x14, #0x60]\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v21.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v26.4h, v9.4h\n"
+ "smlal2 v27.4s, v26.8h, v9.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 41f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 40f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 43f\n"
+ "ld1 { v5.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x16, #0, 43f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x17, #0, 43f\n"
+ "ld1 { v5.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x16, #1, 42f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 43f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x17, #1, 42f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 43f\n"
+ "ld1 { v5.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 43f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x17, #0, 43f\n"
+ "ld1 { v5.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr x20, [x12, #0x68]\n"
- "smlal v28.4s, v23.4h, v29.4h\n"
- "smlal2 v9.4s, v23.8h, v29.8h\n"
- "smlal v0.4s, v23.4h, v17.4h\n"
- "smlal2 v22.4s, v23.8h, v17.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 45f\n"
- "ld1 { v20.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 44f\n"
- "ld1 { v20.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 47f\n"
- "ld1 { v20.b }[6], [x20]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "ldr x20, [x14, #0x68]\n"
+ "smlal v20.4s, v5.4h, v13.4h\n"
+ "smlal2 v1.4s, v5.8h, v13.8h\n"
+ "smlal v3.4s, v5.4h, v7.4h\n"
+ "smlal2 v30.4s, v5.8h, v7.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 45f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 44f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 47f\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x16, #0, 47f\n"
- "ld1 { v20.b }[4], [x20]\n"
+ "tbz x17, #0, 47f\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x16, #1, 46f\n"
- "ld1 { v20.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 47f\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "tbz x17, #1, 46f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 47f\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 47f\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "tbz x17, #0, 47f\n"
+ "ld1 { v19.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ushll v20.8h, v20.8b, #0x0\n"
- "ldr x20, [x12, #0x70]\n"
- "smlal v3.4s, v20.4h, v4.4h\n"
- "smlal2 v30.4s, v20.8h, v4.8h\n"
- "smlal v6.4s, v20.4h, v31.4h\n"
- "smlal2 v2.4s, v20.8h, v31.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 49f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 48f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 51f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [x14, #0x70]\n"
+ "smlal v8.4s, v19.4h, v2.4h\n"
+ "smlal2 v21.4s, v19.8h, v2.8h\n"
+ "smlal v10.4s, v19.4h, v4.4h\n"
+ "smlal2 v27.4s, v19.8h, v4.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x16, #0, 51f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x17, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x16, #1, 50f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 51f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x17, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 51f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x17, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ushll v8.8h, v8.8b, #0x0\n"
- "ldr x20, [x12, #0x78]\n"
- "smlal v0.4s, v8.4h, v16.4h\n"
- "smlal2 v22.4s, v8.8h, v16.8h\n"
- "smlal v6.4s, v8.4h, v29.4h\n"
- "smlal2 v2.4s, v8.8h, v29.8h\n"
- "add x20, x20, x14\n"
- "tbz x16, #2, 53f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x16, #1, 52f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x16, #0, 55f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x14, #0x78]\n"
+ "smlal v3.4s, v16.4h, v14.4h\n"
+ "smlal2 v30.4s, v16.8h, v14.8h\n"
+ "smlal v10.4s, v16.4h, v13.4h\n"
+ "smlal2 v27.4s, v16.8h, v13.8h\n"
+ "add x20, x20, x16\n"
+ "tbz x17, #2, 53f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x17, #1, 52f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x17, #0, 55f\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x16, #0, 55f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x17, #0, 55f\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x16, #1, 54f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x16, #0, 55f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x17, #1, 54f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x17, #0, 55f\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 55f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x17, #0, 55f\n"
+ "ld1 { v28.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ushll v8.8h, v8.8b, #0x0\n"
- "smlal v0.4s, v8.4h, v4.4h\n"
- "smlal2 v22.4s, v8.8h, v4.8h\n"
- "smlal v6.4s, v8.4h, v16.4h\n"
- "smlal2 v2.4s, v8.8h, v16.8h\n"
- "tbz x16, #2, 57f\n"
- "ld1 { v7.4s }, [x10], #0x10\n"
- "ld1 { v23.4s }, [x9], #0x10\n"
- "tbz x16, #1, 56f\n"
- "ld1 { v11.d }[0], [x10], #0x8\n"
- "ld1 { v27.d }[0], [x9], #0x8\n"
- "tbz x16, #0, 59f\n"
- "ld1 { v11.s }[2], [x10]\n"
- "ld1 { v27.s }[2], [x9]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "smlal2 v30.4s, v28.8h, v2.8h\n"
+ "smlal v10.4s, v28.4h, v14.4h\n"
+ "smlal2 v27.4s, v28.8h, v14.8h\n"
+ "tbz x17, #2, 57f\n"
+ "ld1 { v29.4s }, [x12], #0x10\n"
+ "ld1 { v19.4s }, [x11], #0x10\n"
+ "tbz x17, #1, 56f\n"
+ "ld1 { v7.d }[0], [x12], #0x8\n"
+ "ld1 { v18.d }[0], [x11], #0x8\n"
+ "tbz x17, #0, 59f\n"
+ "ld1 { v7.s }[2], [x12]\n"
+ "ld1 { v18.s }[2], [x11]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x16, #0, 59f\n"
- "ld1 { v11.s }[0], [x10]\n"
- "ld1 { v27.s }[0], [x9]\n"
+ "tbz x17, #0, 59f\n"
+ "ld1 { v7.s }[0], [x12]\n"
+ "ld1 { v18.s }[0], [x11]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x16, #1, 58f\n"
- "ld1 { v7.d }[0], [x10], #0x8\n"
- "ld1 { v23.d }[0], [x9], #0x8\n"
- "tbz x16, #0, 59f\n"
- "ld1 { v7.s }[2], [x10]\n"
- "ld1 { v23.s }[2], [x9]\n"
+ "tbz x17, #1, 58f\n"
+ "ld1 { v29.d }[0], [x12], #0x8\n"
+ "ld1 { v19.d }[0], [x11], #0x8\n"
+ "tbz x17, #0, 59f\n"
+ "ld1 { v29.s }[2], [x12]\n"
+ "ld1 { v19.s }[2], [x11]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 59f\n"
- "ld1 { v7.s }[0], [x10]\n"
- "ld1 { v23.s }[0], [x9]\n"
+ "tbz x17, #0, 59f\n"
+ "ld1 { v29.s }[0], [x12]\n"
+ "ld1 { v19.s }[0], [x11]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v28.4s, v28.4s, v7.4s\n"
- "and v20.16b, v28.16b, v23.16b\n"
- "add x28, x28, x13\n"
- "add x27, x27, x13\n"
- "sqrdmulh v9.4s, v9.4s, v11.4s\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "add x26, x26, x13\n"
- "add x25, x25, x13\n"
- "and v4.16b, v9.16b, v27.16b\n"
- "sqrdmulh v3.4s, v3.4s, v7.4s\n"
- "sqrdmulh v0.4s, v0.4s, v7.4s\n"
- "sqrdmulh v6.4s, v6.4s, v7.4s\n"
- "sqadd v28.4s, v28.4s, v20.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v19.16b, v3.16b, v23.16b\n"
- "sqrdmulh v30.4s, v30.4s, v11.4s\n"
- "and v29.16b, v0.16b, v23.16b\n"
- "sqrdmulh v22.4s, v22.4s, v11.4s\n"
- "and v26.16b, v6.16b, v23.16b\n"
- "sqrdmulh v2.4s, v2.4s, v11.4s\n"
- "sqadd v9.4s, v9.4s, v4.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v17.16b, v30.16b, v27.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v8.16b, v22.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v7.4s\n"
+ "add x9, x9, x15\n"
+ "add x28, x28, x15\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v29.4s\n"
+ "add x27, x27, x15\n"
+ "add x26, x26, x15\n"
+ "sqrdmulh v10.4s, v10.4s, v29.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v7.4s\n"
+ "and v24.16b, v20.16b, v19.16b\n"
+ "and v28.16b, v1.16b, v18.16b\n"
+ "and v26.16b, v8.16b, v19.16b\n"
+ "and v0.16b, v3.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v7.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v13.16b, v2.16b, v27.16b\n"
- "sqadd v3.4s, v3.4s, v19.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v0.4s, v0.4s, v29.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "sqadd v6.4s, v6.4s, v26.4s\n"
- "sshr v13.4s, v13.4s, #0x1f\n"
- "srshl v28.4s, v28.4s, v23.4s\n"
- "srshl v3.4s, v3.4s, v23.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "srshl v0.4s, v0.4s, v23.4s\n"
- "sqadd v22.4s, v22.4s, v8.4s\n"
- "srshl v6.4s, v6.4s, v23.4s\n"
- "sqadd v2.4s, v2.4s, v13.4s\n"
- "srshl v9.4s, v9.4s, v27.4s\n"
- "sqxtn v28.4h, v28.4s\n"
- "srshl v30.4s, v30.4s, v27.4s\n"
+ "and v11.16b, v21.16b, v18.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v25.16b, v30.16b, v18.16b\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "and v5.16b, v10.16b, v19.16b\n"
+ "sqadd v1.4s, v1.4s, v28.4s\n"
+ "and v4.16b, v27.16b, v18.16b\n"
+ "sqadd v8.4s, v8.4s, v26.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v3.4s, v3.4s, v0.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v20.4s, v20.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
+ "sqadd v10.4s, v10.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v11.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v25.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "srshl v1.4s, v1.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v21.4s, v21.4s, v18.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v30.4s, v30.4s, v18.4s\n"
"sqxtn v3.4h, v3.4s\n"
- "srshl v22.4s, v22.4s, v27.4s\n"
- "sqxtn v0.4h, v0.4s\n"
- "srshl v2.4s, v2.4s, v27.4s\n"
- "sqxtn v6.4h, v6.4s\n"
- "sqxtn2 v28.8h, v9.4s\n"
+ "srshl v27.4s, v27.4s, v18.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v21.4s\n"
"sqxtn2 v3.8h, v30.4s\n"
- "sqxtn2 v0.8h, v22.4s\n"
- "sqxtn2 v6.8h, v2.4s\n"
- "sqadd v28.8h, v28.8h, v5.8h\n"
- "sqadd v3.8h, v3.8h, v5.8h\n"
- "sqadd v0.8h, v0.8h, v5.8h\n"
- "sqadd v6.8h, v6.8h, v5.8h\n"
- "smax v28.8h, v28.8h, v14.8h\n"
- "smax v3.8h, v3.8h, v14.8h\n"
- "smax v0.8h, v0.8h, v14.8h\n"
- "smax v6.8h, v6.8h, v14.8h\n"
- "smin v28.8h, v28.8h, v12.8h\n"
- "smin v3.8h, v3.8h, v12.8h\n"
- "smin v0.8h, v0.8h, v12.8h\n"
- "smin v6.8h, v6.8h, v12.8h\n"
- "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "sqxtn2 v10.8h, v27.4s\n"
+ "sqadd v20.8h, v20.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v3.8h, v3.8h, v12.8h\n"
+ "sqadd v10.8h, v10.8h, v12.8h\n"
+ "smax v20.8h, v20.8h, v15.8h\n"
+ "smax v8.8h, v8.8h, v15.8h\n"
+ "smax v3.8h, v3.8h, v15.8h\n"
+ "smax v10.8h, v10.8h, v15.8h\n"
+ "smin v20.8h, v20.8h, v17.8h\n"
+ "smin v8.8h, v8.8h, v17.8h\n"
+ "smin v3.8h, v3.8h, v17.8h\n"
+ "smin v10.8h, v10.8h, v17.8h\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
"uzp1 v3.16b, v3.16b, v3.16b\n"
- "uzp1 v0.16b, v0.16b, v0.16b\n"
- "uzp1 v6.16b, v6.16b, v6.16b\n"
- "tbz x16, #2, 61f\n"
- "st1 { v28.s }[0], [x28], #0x4\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x17, #2, 61f\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v8.s }[0], [x28], #0x4\n"
"st1 { v3.s }[0], [x27], #0x4\n"
- "st1 { v0.s }[0], [x26], #0x4\n"
- "st1 { v6.s }[0], [x25], #0x4\n"
- "tbz x16, #1, 60f\n"
- "st1 { v28.h }[2], [x28], #0x2\n"
+ "st1 { v10.s }[0], [x26], #0x4\n"
+ "tbz x17, #1, 60f\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x28], #0x2\n"
"st1 { v3.h }[2], [x27], #0x2\n"
- "st1 { v0.h }[2], [x26], #0x2\n"
- "st1 { v6.h }[2], [x25], #0x2\n"
- "tbz x16, #0, 63f\n"
- "st1 { v28.b }[6], [x28], #0x1\n"
+ "st1 { v10.h }[2], [x26], #0x2\n"
+ "tbz x17, #0, 63f\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v8.b }[6], [x28], #0x1\n"
"st1 { v3.b }[6], [x27], #0x1\n"
- "st1 { v0.b }[6], [x26], #0x1\n"
- "st1 { v6.b }[6], [x25], #0x1\n"
+ "st1 { v10.b }[6], [x26], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x16, #0, 63f\n"
- "st1 { v28.b }[4], [x28], #0x1\n"
+ "tbz x17, #0, 63f\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v8.b }[4], [x28], #0x1\n"
"st1 { v3.b }[4], [x27], #0x1\n"
- "st1 { v0.b }[4], [x26], #0x1\n"
- "st1 { v6.b }[4], [x25], #0x1\n"
+ "st1 { v10.b }[4], [x26], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x16, #1, 62f\n"
- "st1 { v28.h }[0], [x28], #0x2\n"
+ "tbz x17, #1, 62f\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v8.h }[0], [x28], #0x2\n"
"st1 { v3.h }[0], [x27], #0x2\n"
- "st1 { v0.h }[0], [x26], #0x2\n"
- "st1 { v6.h }[0], [x25], #0x2\n"
- "tbz x16, #0, 63f\n"
- "st1 { v28.b }[2], [x28], #0x1\n"
+ "st1 { v10.h }[0], [x26], #0x2\n"
+ "tbz x17, #0, 63f\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v8.b }[2], [x28], #0x1\n"
"st1 { v3.b }[2], [x27], #0x1\n"
- "st1 { v0.b }[2], [x26], #0x1\n"
- "st1 { v6.b }[2], [x25], #0x1\n"
+ "st1 { v10.b }[2], [x26], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x16, #0, 63f\n"
- "st1 { v28.b }[0], [x28], #0x1\n"
+ "tbz x17, #0, 63f\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v8.b }[0], [x28], #0x1\n"
"st1 { v3.b }[0], [x27], #0x1\n"
- "st1 { v0.b }[0], [x26], #0x1\n"
- "st1 { v6.b }[0], [x25], #0x1\n"
+ "st1 { v10.b }[0], [x26], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 6cb10a7bb2..7ebd3a5620 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,1292 +100,1292 @@ void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v6.16b }, [x20]\n"
+ "mov x4, #0x0\n"
+ "mov x5, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x6, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x16, x3, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v22.8h }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
+ "ld1r { v12.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "mov x17, #0x0\n"
- "ld1r { v5.8h }, [x20]\n"
- "mov x16, #0x0\n"
- "add x15, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d12, [x14, #0x0]\n"
- "ldr d11, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "ldr d25, [x14, #0x10]\n"
- "ldr d24, [x14, #0x18]\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "ldr d23, [x14, #0x20]\n"
- "ldr d7, [x14, #0x28]\n"
- "usubl v24.8h, v24.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "ldr d3, [x14, #0x30]\n"
- "ldr d9, [x14, #0x38]\n"
- "usubl v7.8h, v7.8b, v6.8b\n"
- "usubl v3.8h, v3.8b, v6.8b\n"
- "ldr d30, [x14, #0x40]\n"
+ "ld1r { v6.8h }, [x20]\n"
+ "ldp x15, x14, [x22, #0x0]\n"
+ "ldp x13, x12, [x22, #0x10]\n"
+ "cbz x16, 3f\n"
+ "ldr d15, [x7, #0x0]\n"
+ "ldr d13, [x7, #0x8]\n"
+ "subs x16, x16, #0x1\n"
+ "ldr d28, [x7, #0x10]\n"
+ "ldr d11, [x7, #0x18]\n"
+ "ldr d23, [x7, #0x20]\n"
+ "ldr d17, [x7, #0x28]\n"
+ "ldr d10, [x7, #0x30]\n"
+ "ldr d2, [x7, #0x38]\n"
+ "usubl v15.8h, v15.8b, v16.8b\n"
+ "usubl v13.8h, v13.8b, v16.8b\n"
+ "ldr d3, [x7, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v9.8h, v9.8b, v6.8b\n"
- "usubl v30.8h, v30.8b, v6.8b\n"
- "ldr q8, [x20, #0x0]\n"
- "ldr q2, [x20, #0x10]\n"
+ "usubl v28.8h, v28.8b, v16.8b\n"
+ "usubl v11.8h, v11.8b, v16.8b\n"
+ "usubl v23.8h, v23.8b, v16.8b\n"
+ "usubl v17.8h, v17.8b, v16.8b\n"
+ "usubl v10.8h, v10.8b, v16.8b\n"
+ "usubl v2.8h, v2.8b, v16.8b\n"
+ "ldr q22, [x20, #0x0]\n"
+ "ldr q8, [x20, #0x10]\n"
+ "ldp x27, x26, [x6, #0x0]\n"
"add x20, x20, #0x20\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
+ "mov v9.16b, v22.16b\n"
+ "mov v31.16b, v8.16b\n"
+ "mov v20.16b, v22.16b\n"
"mov v21.16b, v8.16b\n"
- "mov v4.16b, v2.16b\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "mov v20.16b, v8.16b\n"
- "mov v1.16b, v2.16b\n"
- "ldr d26, [x27, x17]\n"
- "ldr d18, [x26, x17]\n"
- "mov v16.16b, v8.16b\n"
- "mov v14.16b, v2.16b\n"
- "ldr d10, [x25, x17]\n"
- "ldr d27, [x24, x17]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "ldr d17, [x23, x17]\n"
- "ldr d19, [x22, x17]\n"
- "ushll v10.8h, v10.8b, #0x0\n"
+ "ldp x25, x24, [x6, #0x10]\n"
+ "mov v18.16b, v22.16b\n"
+ "mov v5.16b, v8.16b\n"
+ "ldp x23, x22, [x6, #0x20]\n"
+ "ldp x21, x20, [x6, #0x30]\n"
+ "ldr d29, [x27, x4]\n"
+ "ldr d25, [x26, x4]\n"
+ "ldr d0, [x25, x4]\n"
+ "ldr d7, [x24, x4]\n"
+ "ldr d24, [x23, x4]\n"
+ "ldr d27, [x22, x4]\n"
+ "ldr d26, [x21, x4]\n"
+ "ldr d1, [x20, x4]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr d15, [x21, x17]\n"
- "ldr d28, [x20, x17]\n"
- "ushll v17.8h, v17.8b, #0x0\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr q31, [x13, #0x0]\n"
- "ldr q0, [x12, #0x0]\n"
- "smlal v8.4s, v26.4h, v30.4h\n"
- "smlal2 v2.4s, v26.8h, v30.8h\n"
- "ldr q29, [x13, #0x10]\n"
- "ldr x21, [x15, #0x58]\n"
- "smlal v8.4s, v18.4h, v12.4h\n"
- "smlal v21.4s, v26.4h, v3.4h\n"
- "ldr x20, [x15, #0x78]\n"
- "ldr x25, [x15, #0x60]\n"
- "smlal v20.4s, v26.4h, v25.4h\n"
- "smlal v16.4s, v26.4h, v12.4h\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal2 v2.4s, v18.8h, v12.8h\n"
- "ldr d18, [x21, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "smlal v8.4s, v10.4h, v11.4h\n"
- "smlal2 v4.4s, v26.8h, v3.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v1.4s, v26.8h, v25.8h\n"
- "smlal2 v14.4s, v26.8h, v12.8h\n"
- "ldr d26, [x20, x17]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v21.4s, v27.4h, v11.4h\n"
- "smlal v20.4s, v18.4h, v24.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal v16.4s, v26.4h, v23.4h\n"
- "smlal2 v2.4s, v10.8h, v11.8h\n"
- "ldr d10, [x25, x17]\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal v8.4s, v19.4h, v24.4h\n"
- "smlal2 v4.4s, v27.8h, v11.8h\n"
- "ldr d27, [x24, x17]\n"
+ "ldr q30, [x8, #0x0]\n"
+ "ldr q4, [x17, #0x0]\n"
+ "smlal v22.4s, v29.4h, v3.4h\n"
+ "smlal2 v8.4s, v29.8h, v3.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "ldr x21, [x6, #0x58]\n"
+ "smlal v9.4s, v29.4h, v10.4h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "ldr x20, [x6, #0x78]\n"
+ "ldr x23, [x6, #0x60]\n"
+ "smlal v18.4s, v29.4h, v15.4h\n"
+ "smlal2 v31.4s, v29.8h, v10.8h\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v21.4s, v29.8h, v28.8h\n"
+ "smlal2 v5.4s, v29.8h, v15.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v22.4s, v25.4h, v15.4h\n"
+ "smlal2 v8.4s, v25.8h, v15.8h\n"
+ "ldr d25, [x21, x4]\n"
+ "ldr x21, [x6, #0x68]\n"
+ "ldr x11, [x6, #0x88]\n"
+ "smlal v9.4s, v7.4h, v13.4h\n"
+ "ldr x10, [x6, #0x40]\n"
+ "add x7, x7, #0x48\n"
+ "smlal2 v31.4s, v7.8h, v13.8h\n"
+ "ldr d7, [x20, x4]\n"
+ "ldr x20, [x6, #0x70]\n"
+ "subs x16, x16, #0x1\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x9, [x6, #0x98]\n"
+ "ldr x28, [x6, #0x50]\n"
+ "add x8, x8, #0x20\n"
+ "smlal v22.4s, v0.4h, v13.4h\n"
+ "smlal2 v8.4s, v0.8h, v13.8h\n"
+ "ldr d0, [x23, x4]\n"
+ "ldr x27, [x6, #0x48]\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "smlal v9.4s, v24.4h, v28.4h\n"
+ "ldr x26, [x6, #0x90]\n"
+ "ldr x25, [x6, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v11.4h\n"
+ "smlal2 v21.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x22, x4]\n"
+ "ldr x24, [x6, #0xa0]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal2 v31.4s, v24.8h, v28.8h\n"
+ "ldr d24, [x21, x4]\n"
+ "ldr x23, [x6, #0xb0]\n"
+ "smlal v18.4s, v7.4h, v23.4h\n"
+ "smlal v22.4s, v27.4h, v11.4h\n"
+ "ldr x22, [x6, #0xb8]\n"
+ "ldr x21, [x6, #0xc0]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v5.4s, v7.8h, v23.8h\n"
+ "ldr d7, [x11, x4]\n"
+ "smlal2 v8.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x10, x4]\n"
+ "smlal v20.4s, v0.4h, v15.4h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v21.4s, v0.8h, v15.8h\n"
+ "smlal v9.4s, v1.4h, v15.4h\n"
+ "smlal2 v31.4s, v1.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v18.4s, v25.4h, v13.4h\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "smlal v22.4s, v26.4h, v23.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal2 v5.4s, v25.8h, v13.8h\n"
+ "smlal2 v8.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x9, x4]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v1.4s, v18.8h, v24.8h\n"
- "ldr d18, [x23, x17]\n"
- "smlal2 v14.4s, v26.8h, v23.8h\n"
- "ldr d26, [x22, x17]\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v21.4s, v17.4h, v25.4h\n"
- "smlal v20.4s, v10.4h, v12.4h\n"
- "ldr x23, [x15, #0x50]\n"
- "smlal v16.4s, v27.4h, v11.4h\n"
- "smlal2 v2.4s, v19.8h, v24.8h\n"
- "ldr d19, [x21, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v8.4s, v15.4h, v23.4h\n"
- "smlal2 v4.4s, v17.8h, v25.8h\n"
- "ldr d17, [x20, x17]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal2 v1.4s, v10.8h, v12.8h\n"
- "smlal2 v14.4s, v27.8h, v11.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v28.4h, v12.4h\n"
- "smlal v20.4s, v18.4h, v23.4h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "smlal v16.4s, v26.4h, v7.4h\n"
- "smlal2 v2.4s, v15.8h, v23.8h\n"
- "ldr d15, [x24, x17]\n"
- "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v20.4s, v24.4h, v23.4h\n"
"ushll v15.8h, v15.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v25.4h\n"
- "smlal2 v4.4s, v28.8h, v12.8h\n"
- "ldr d12, [x23, x17]\n"
- "ushll v12.8h, v12.8b, #0x0\n"
- "smlal2 v1.4s, v18.8h, v23.8h\n"
- "ldr d18, [x22, x17]\n"
- "smlal2 v14.4s, v26.8h, v7.8h\n"
- "ldr d26, [x21, x17]\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v21.4s, v19.4h, v23.4h\n"
- "smlal v20.4s, v17.4h, v11.4h\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v16.4s, v15.4h, v25.4h\n"
- "smlal2 v2.4s, v28.8h, v25.8h\n"
- "ldr d28, [x20, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v21.4s, v24.8h, v23.8h\n"
+ "ldr d24, [x28, x4]\n"
+ "smlal v18.4s, v7.4h, v17.4h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v8.4s, v12.4h, v7.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v4.4s, v19.8h, v23.8h\n"
- "ldr d23, [x22, x17]\n"
- "ldr d19, [x21, x17]\n"
- "smlal2 v1.4s, v17.8h, v11.8h\n"
- "ldr d11, [x20, x17]\n"
- "smlal2 v14.4s, v15.8h, v25.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal v21.4s, v18.4h, v7.4h\n"
- "smlal v20.4s, v26.4h, v3.4h\n"
+ "smlal v22.4s, v1.4h, v28.4h\n"
+ "smlal v9.4s, v27.4h, v23.4h\n"
+ "smlal2 v31.4s, v27.8h, v23.8h\n"
+ "ldr d27, [x27, x4]\n"
+ "ldr d23, [x26, x4]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v5.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x25, x4]\n"
+ "smlal2 v8.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x24, x4]\n"
+ "smlal v20.4s, v15.4h, v13.4h\n"
+ "smlal2 v21.4s, v15.8h, v13.8h\n"
+ "ldr d13, [x23, x4]\n"
+ "smlal v18.4s, v26.4h, v28.4h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "ldr x21, [x15, #0xc0]\n"
- "smlal v16.4s, v28.4h, v24.4h\n"
- "smlal2 v2.4s, v12.8h, v7.8h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal v8.4s, v10.4h, v3.4h\n"
- "smlal2 v4.4s, v18.8h, v7.8h\n"
- "ldr d18, [x21, x17]\n"
- "ushll v11.8h, v11.8b, #0x0\n"
- "smlal2 v1.4s, v26.8h, v3.8h\n"
- "smlal2 v14.4s, v28.8h, v24.8h\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "add x14, x14, #0x48\n"
- "smlal v21.4s, v12.4h, v24.4h\n"
- "smlal v20.4s, v23.4h, v9.4h\n"
- "add x17, x17, #0x8\n"
- "subs x8, x8, #0x1\n"
- "smlal v16.4s, v19.4h, v9.4h\n"
- "smlal2 v2.4s, v10.8h, v3.8h\n"
- "add x13, x13, #0x20\n"
- "add x12, x12, #0x20\n"
- "smlal v8.4s, v17.4h, v9.4h\n"
- "smlal2 v4.4s, v12.8h, v24.8h\n"
- "sqrdmulh v8.4s, v8.4s, v31.4s\n"
- "smlal2 v1.4s, v23.8h, v9.8h\n"
- "smlal2 v14.4s, v19.8h, v9.8h\n"
- "and v10.16b, v8.16b, v0.16b\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "smlal v16.4s, v11.4h, v3.4h\n"
- "smlal2 v2.4s, v17.8h, v9.8h\n"
- "sqrdmulh v2.4s, v2.4s, v29.4s\n"
- "smlal2 v4.4s, v27.8h, v9.8h\n"
- "smlal2 v1.4s, v28.8h, v7.8h\n"
- "and v12.16b, v2.16b, v25.16b\n"
- "smlal2 v14.4s, v11.8h, v3.8h\n"
- "smlal v21.4s, v15.4h, v30.4h\n"
- "sqrdmulh v21.4s, v21.4s, v31.4s\n"
- "smlal v20.4s, v11.4h, v30.4h\n"
- "smlal v16.4s, v18.4h, v30.4h\n"
- "sqrdmulh v20.4s, v20.4s, v31.4s\n"
- "smlal2 v4.4s, v15.8h, v30.8h\n"
- "smlal2 v1.4s, v11.8h, v30.8h\n"
- "sqrdmulh v16.4s, v16.4s, v31.4s\n"
- "smlal2 v14.4s, v18.8h, v30.8h\n"
- "sqadd v8.4s, v8.4s, v10.4s\n"
- "sshr v12.4s, v12.4s, #0x1f\n"
- "and v27.16b, v21.16b, v0.16b\n"
- "sqrdmulh v4.4s, v4.4s, v29.4s\n"
- "and v24.16b, v20.16b, v0.16b\n"
- "sqrdmulh v1.4s, v1.4s, v29.4s\n"
- "and v19.16b, v16.16b, v0.16b\n"
- "sqrdmulh v14.4s, v14.4s, v29.4s\n"
- "sqadd v2.4s, v2.4s, v12.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v18.16b, v4.16b, v25.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "and v17.16b, v1.16b, v25.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v15.16b, v14.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v24.4s\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "smlal v22.4s, v24.4h, v17.4h\n"
+ "smlal2 v5.4s, v26.8h, v28.8h\n"
+ "ldr d28, [x22, x4]\n"
+ "smlal2 v8.4s, v24.8h, v17.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal v9.4s, v27.4h, v17.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal2 v31.4s, v27.8h, v17.8h\n"
+ "ldr d27, [x21, x4]\n"
+ "smlal v20.4s, v23.4h, v10.4h\n"
+ "smlal v18.4s, v7.4h, v11.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v22.4s, v0.4h, v10.4h\n"
+ "smlal2 v21.4s, v23.8h, v10.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v5.4s, v7.8h, v11.8h\n"
+ "smlal2 v8.4s, v0.8h, v10.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v9.4s, v24.4h, v11.4h\n"
+ "smlal2 v31.4s, v24.8h, v11.8h\n"
+ "smlal v20.4s, v1.4h, v2.4h\n"
+ "smlal v18.4s, v13.4h, v2.4h\n"
+ "smlal v22.4s, v15.4h, v2.4h\n"
+ "smlal2 v21.4s, v1.8h, v2.8h\n"
+ "smlal2 v5.4s, v13.8h, v2.8h\n"
+ "smlal2 v8.4s, v15.8h, v2.8h\n"
+ "smlal v9.4s, v25.4h, v2.4h\n"
+ "smlal2 v31.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v7.4h, v17.4h\n"
+ "smlal v18.4s, v28.4h, v10.4h\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "smlal2 v21.4s, v7.8h, v17.8h\n"
+ "smlal2 v5.4s, v28.8h, v10.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "and v17.16b, v22.16b, v4.16b\n"
+ "smlal2 v31.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "smlal v18.4s, v27.4h, v3.4h\n"
+ "and v26.16b, v8.16b, v29.16b\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v3.8h\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "srshl v8.4s, v8.4s, v0.4s\n"
- "srshl v21.4s, v21.4s, v0.4s\n"
- "sqadd v4.4s, v4.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
- "sqadd v1.4s, v1.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
- "sqadd v14.4s, v14.4s, v15.4s\n"
- "srshl v2.4s, v2.4s, v25.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v4.4s, v4.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v30.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v25.16b, v9.16b, v4.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "and v0.16b, v20.16b, v4.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v19.4s\n"
+ "and v1.16b, v18.16b, v4.16b\n"
+ "sqadd v8.4s, v8.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v11.16b, v31.16b, v29.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v21.16b, v29.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v30.16b, v5.16b, v29.16b\n"
+ "sqadd v9.4s, v9.4s, v25.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "srshl v9.4s, v9.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v11.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v4.4s\n"
+ "sqadd v5.4s, v5.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v31.4s, v31.4s, v29.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v14.4s, v14.4s, v25.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "sqxtn2 v8.8h, v2.4s\n"
- "sqxtn2 v21.8h, v4.4s\n"
- "sqxtn2 v20.8h, v1.4s\n"
- "sqxtn2 v16.8h, v14.4s\n"
- "sqadd v8.8h, v8.8h, v22.8h\n"
- "sqadd v21.8h, v21.8h, v22.8h\n"
- "sqadd v20.8h, v20.8h, v22.8h\n"
- "sqadd v16.8h, v16.8h, v22.8h\n"
- "smax v8.8h, v8.8h, v13.8h\n"
- "smax v21.8h, v21.8h, v13.8h\n"
- "smax v20.8h, v20.8h, v13.8h\n"
- "smax v16.8h, v16.8h, v13.8h\n"
- "smin v8.8h, v8.8h, v5.8h\n"
- "smin v21.8h, v21.8h, v5.8h\n"
- "smin v20.8h, v20.8h, v5.8h\n"
- "smin v16.8h, v16.8h, v5.8h\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "str d8, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "sqxtn2 v22.8h, v8.4s\n"
+ "sqxtn2 v9.8h, v31.4s\n"
+ "sqxtn2 v20.8h, v21.4s\n"
+ "sqxtn2 v18.8h, v5.4s\n"
+ "sqadd v22.8h, v22.8h, v12.8h\n"
+ "sqadd v9.8h, v9.8h, v12.8h\n"
+ "sqadd v20.8h, v20.8h, v12.8h\n"
+ "sqadd v18.8h, v18.8h, v12.8h\n"
+ "smax v22.8h, v22.8h, v14.8h\n"
+ "smax v9.8h, v9.8h, v14.8h\n"
+ "smax v20.8h, v20.8h, v14.8h\n"
+ "smax v18.8h, v18.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v6.8h\n"
+ "smin v9.8h, v9.8h, v6.8h\n"
+ "smin v20.8h, v20.8h, v6.8h\n"
+ "smin v18.8h, v18.8h, v6.8h\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d20, [x9, x16]\n"
- "str d16, [x28, x16]\n"
- "ldr q8, [x20, #0x0]\n"
- "ldr q2, [x20, #0x10]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d22, [x15, x5]\n"
+ "str d9, [x14, x5]\n"
+ "str d20, [x13, x5]\n"
+ "str d18, [x12, x5]\n"
+ "add x5, x5, #0x8\n"
+ "ldr q22, [x20, #0x0]\n"
+ "ldr q8, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d12, [x14, #0x0]\n"
- "ldr d11, [x14, #0x8]\n"
- "add x16, x16, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d25, [x14, #0x10]\n"
- "ldr d24, [x14, #0x18]\n"
+ "ldr d15, [x7, #0x0]\n"
+ "ldr d13, [x7, #0x8]\n"
+ "ldr d28, [x7, #0x10]\n"
+ "ldr d11, [x7, #0x18]\n"
+ "ldr d23, [x7, #0x20]\n"
+ "ldr d17, [x7, #0x28]\n"
+ "mov v9.16b, v22.16b\n"
+ "mov v31.16b, v8.16b\n"
+ "ldr d10, [x7, #0x30]\n"
+ "ldr d2, [x7, #0x38]\n"
+ "mov v20.16b, v22.16b\n"
"mov v21.16b, v8.16b\n"
- "mov v4.16b, v2.16b\n"
- "ldr d23, [x14, #0x20]\n"
- "ldr d7, [x14, #0x28]\n"
- "mov v20.16b, v8.16b\n"
- "mov v1.16b, v2.16b\n"
- "ldr d3, [x14, #0x30]\n"
- "ldr d9, [x14, #0x38]\n"
- "mov v16.16b, v8.16b\n"
- "mov v14.16b, v2.16b\n"
- "ldr d30, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v24.8h, v24.8b, v6.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ldr d26, [x27, x17]\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "usubl v7.8h, v7.8b, v6.8b\n"
- "ldr d18, [x26, x17]\n"
- "ldr d10, [x25, x17]\n"
- "usubl v3.8h, v3.8b, v6.8b\n"
- "usubl v9.8h, v9.8b, v6.8b\n"
- "ldr d27, [x24, x17]\n"
- "ldr d17, [x23, x17]\n"
- "usubl v30.8h, v30.8b, v6.8b\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "ldr d19, [x22, x17]\n"
- "ldr d15, [x21, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "ldr d28, [x20, x17]\n"
+ "ldr d3, [x7, #0x40]\n"
+ "ldp x27, x26, [x6, #0x0]\n"
+ "mov v18.16b, v22.16b\n"
+ "mov v5.16b, v8.16b\n"
+ "usubl v15.8h, v15.8b, v16.8b\n"
+ "usubl v13.8h, v13.8b, v16.8b\n"
+ "usubl v28.8h, v28.8b, v16.8b\n"
+ "usubl v11.8h, v11.8b, v16.8b\n"
+ "ldp x25, x24, [x6, #0x10]\n"
+ "usubl v23.8h, v23.8b, v16.8b\n"
+ "usubl v17.8h, v17.8b, v16.8b\n"
+ "usubl v10.8h, v10.8b, v16.8b\n"
+ "usubl v2.8h, v2.8b, v16.8b\n"
+ "ldp x23, x22, [x6, #0x20]\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "ldp x21, x20, [x6, #0x30]\n"
+ "ldr d29, [x27, x4]\n"
+ "ldr d25, [x26, x4]\n"
+ "ldr d0, [x25, x4]\n"
+ "ldr d7, [x24, x4]\n"
+ "ldr d24, [x23, x4]\n"
+ "ldr d27, [x22, x4]\n"
+ "ldr d26, [x21, x4]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr d1, [x20, x4]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ushll v17.8h, v17.8b, #0x0\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q0, [x13, #0x0]\n"
- "ldr q31, [x12, #0x0]\n"
- "smlal v8.4s, v26.4h, v30.4h\n"
- "smlal2 v2.4s, v26.8h, v30.8h\n"
- "ldr q29, [x13, #0x10]\n"
- "ldr x21, [x15, #0x58]\n"
- "smlal v8.4s, v18.4h, v12.4h\n"
- "smlal v21.4s, v26.4h, v3.4h\n"
- "ldr x20, [x15, #0x78]\n"
- "ldr x25, [x15, #0x60]\n"
- "smlal v20.4s, v26.4h, v25.4h\n"
- "smlal v16.4s, v26.4h, v12.4h\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal2 v2.4s, v18.8h, v12.8h\n"
- "ldr d18, [x21, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "smlal v8.4s, v10.4h, v11.4h\n"
- "smlal2 v4.4s, v26.8h, v3.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v1.4s, v26.8h, v25.8h\n"
- "smlal2 v14.4s, v26.8h, v12.8h\n"
- "ldr d26, [x20, x17]\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v21.4s, v27.4h, v11.4h\n"
- "smlal v20.4s, v18.4h, v24.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal v16.4s, v26.4h, v23.4h\n"
- "smlal2 v2.4s, v10.8h, v11.8h\n"
- "ldr d10, [x25, x17]\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal v8.4s, v19.4h, v24.4h\n"
- "smlal2 v4.4s, v27.8h, v11.8h\n"
- "ldr d27, [x24, x17]\n"
+ "ldr q30, [x8, #0x0]\n"
+ "ldr q4, [x17, #0x0]\n"
+ "smlal v22.4s, v29.4h, v3.4h\n"
+ "smlal2 v8.4s, v29.8h, v3.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v9.4s, v29.4h, v10.4h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "ldr x23, [x6, #0x78]\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v18.4s, v29.4h, v15.4h\n"
+ "smlal2 v31.4s, v29.8h, v10.8h\n"
+ "ldr x21, [x6, #0x80]\n"
+ "smlal2 v21.4s, v29.8h, v28.8h\n"
+ "smlal2 v5.4s, v29.8h, v15.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v22.4s, v25.4h, v15.4h\n"
+ "smlal2 v8.4s, v25.8h, v15.8h\n"
+ "ldr d25, [x20, x4]\n"
+ "ldr x20, [x6, #0x68]\n"
+ "ldr x11, [x6, #0x88]\n"
+ "smlal v9.4s, v7.4h, v13.4h\n"
+ "ldr x10, [x6, #0x40]\n"
+ "tst x3, #0x7\n"
+ "smlal2 v31.4s, v7.8h, v13.8h\n"
+ "ldr d7, [x23, x4]\n"
+ "ldr x9, [x6, #0x70]\n"
+ "add x8, x8, #0x20\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x28, [x6, #0x98]\n"
+ "ldr x27, [x6, #0x50]\n"
+ "add x17, x17, #0x20\n"
+ "smlal v22.4s, v0.4h, v13.4h\n"
+ "smlal2 v8.4s, v0.8h, v13.8h\n"
+ "ldr d0, [x22, x4]\n"
+ "ldr x26, [x6, #0x48]\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "smlal v9.4s, v24.4h, v28.4h\n"
+ "ldr x25, [x6, #0x90]\n"
+ "ldr x24, [x6, #0xa8]\n"
+ "smlal v20.4s, v25.4h, v11.4h\n"
+ "smlal2 v21.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x21, x4]\n"
+ "ldr x23, [x6, #0xa0]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal2 v31.4s, v24.8h, v28.8h\n"
+ "ldr d24, [x20, x4]\n"
+ "ldr x22, [x6, #0xb0]\n"
+ "smlal v18.4s, v7.4h, v23.4h\n"
+ "smlal v22.4s, v27.4h, v11.4h\n"
+ "ldr x21, [x6, #0xb8]\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v5.4s, v7.8h, v23.8h\n"
+ "ldr d7, [x11, x4]\n"
+ "smlal2 v8.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x10, x4]\n"
+ "smlal v20.4s, v0.4h, v15.4h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v21.4s, v0.8h, v15.8h\n"
+ "smlal v9.4s, v1.4h, v15.4h\n"
+ "smlal2 v31.4s, v1.8h, v15.8h\n"
+ "ldr d15, [x9, x4]\n"
+ "smlal v18.4s, v25.4h, v13.4h\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "smlal v22.4s, v26.4h, v23.4h\n"
+ "smlal2 v5.4s, v25.8h, v13.8h\n"
+ "smlal2 v8.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x28, x4]\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "smlal2 v1.4s, v18.8h, v24.8h\n"
- "ldr d18, [x23, x17]\n"
- "smlal2 v14.4s, v26.8h, v23.8h\n"
- "ldr d26, [x22, x17]\n"
- "ldr x24, [x15, #0x98]\n"
- "smlal v21.4s, v17.4h, v25.4h\n"
- "smlal v20.4s, v10.4h, v12.4h\n"
- "ldr x23, [x15, #0x50]\n"
- "smlal v16.4s, v27.4h, v11.4h\n"
- "smlal2 v2.4s, v19.8h, v24.8h\n"
- "ldr d19, [x21, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v8.4s, v15.4h, v23.4h\n"
- "smlal2 v4.4s, v17.8h, v25.8h\n"
- "ldr d17, [x20, x17]\n"
- "ldr x22, [x15, #0x48]\n"
- "smlal2 v1.4s, v10.8h, v12.8h\n"
- "smlal2 v14.4s, v27.8h, v11.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v28.4h, v12.4h\n"
- "smlal v20.4s, v18.4h, v23.4h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "smlal v16.4s, v26.4h, v7.4h\n"
- "smlal2 v2.4s, v15.8h, v23.8h\n"
- "ldr d15, [x24, x17]\n"
- "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v20.4s, v24.4h, v23.4h\n"
"ushll v15.8h, v15.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v25.4h\n"
- "smlal2 v4.4s, v28.8h, v12.8h\n"
- "ldr d12, [x23, x17]\n"
- "ushll v12.8h, v12.8b, #0x0\n"
- "smlal2 v1.4s, v18.8h, v23.8h\n"
- "ldr d18, [x22, x17]\n"
- "smlal2 v14.4s, v26.8h, v7.8h\n"
- "ldr d26, [x21, x17]\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v21.4s, v19.4h, v23.4h\n"
- "smlal v20.4s, v17.4h, v11.4h\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v16.4s, v15.4h, v25.4h\n"
- "smlal2 v2.4s, v28.8h, v25.8h\n"
- "ldr d28, [x20, x17]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v21.4s, v24.8h, v23.8h\n"
+ "ldr d24, [x27, x4]\n"
+ "smlal v18.4s, v7.4h, v17.4h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "smlal v8.4s, v12.4h, v7.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v4.4s, v19.8h, v23.8h\n"
- "ldr d23, [x22, x17]\n"
- "ldr d19, [x21, x17]\n"
- "smlal2 v1.4s, v17.8h, v11.8h\n"
- "ldr d11, [x20, x17]\n"
- "smlal2 v14.4s, v15.8h, v25.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal v21.4s, v18.4h, v7.4h\n"
- "smlal v20.4s, v26.4h, v3.4h\n"
+ "smlal v22.4s, v1.4h, v28.4h\n"
+ "smlal v9.4s, v27.4h, v23.4h\n"
+ "smlal2 v31.4s, v27.8h, v23.8h\n"
+ "ldr d27, [x26, x4]\n"
+ "ldr d23, [x25, x4]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v5.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x24, x4]\n"
+ "smlal2 v8.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x23, x4]\n"
+ "smlal v20.4s, v15.4h, v13.4h\n"
+ "smlal2 v21.4s, v15.8h, v13.8h\n"
+ "ldr d13, [x22, x4]\n"
+ "smlal v18.4s, v26.4h, v28.4h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal v16.4s, v28.4h, v24.4h\n"
- "smlal2 v2.4s, v12.8h, v7.8h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "tst x7, #0x7\n"
- "smlal v8.4s, v10.4h, v3.4h\n"
- "smlal2 v4.4s, v18.8h, v7.8h\n"
- "ldr d18, [x20, x17]\n"
- "ushll v11.8h, v11.8b, #0x0\n"
- "smlal2 v1.4s, v26.8h, v3.8h\n"
- "smlal2 v14.4s, v28.8h, v24.8h\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "add x17, x17, #0x8\n"
- "smlal v21.4s, v12.4h, v24.4h\n"
- "smlal v20.4s, v23.4h, v9.4h\n"
- "add x13, x13, #0x20\n"
- "add x12, x12, #0x20\n"
- "smlal v16.4s, v19.4h, v9.4h\n"
- "smlal2 v2.4s, v10.8h, v3.8h\n"
- "smlal v8.4s, v17.4h, v9.4h\n"
- "smlal2 v4.4s, v12.8h, v24.8h\n"
- "sqrdmulh v8.4s, v8.4s, v0.4s\n"
- "smlal2 v1.4s, v23.8h, v9.8h\n"
- "smlal2 v14.4s, v19.8h, v9.8h\n"
- "and v23.16b, v8.16b, v31.16b\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "smlal v16.4s, v11.4h, v3.4h\n"
- "smlal2 v2.4s, v17.8h, v9.8h\n"
- "sqrdmulh v2.4s, v2.4s, v29.4s\n"
- "smlal2 v4.4s, v27.8h, v9.8h\n"
- "smlal2 v1.4s, v28.8h, v7.8h\n"
- "and v7.16b, v2.16b, v25.16b\n"
- "smlal2 v14.4s, v11.8h, v3.8h\n"
- "smlal v21.4s, v15.4h, v30.4h\n"
- "sqrdmulh v21.4s, v21.4s, v0.4s\n"
- "smlal v20.4s, v11.4h, v30.4h\n"
- "smlal v16.4s, v18.4h, v30.4h\n"
- "sqrdmulh v20.4s, v20.4s, v0.4s\n"
- "smlal2 v4.4s, v15.8h, v30.8h\n"
- "smlal2 v1.4s, v11.8h, v30.8h\n"
- "sqrdmulh v16.4s, v16.4s, v0.4s\n"
- "smlal2 v14.4s, v18.8h, v30.8h\n"
- "sqadd v8.4s, v8.4s, v23.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v23.16b, v21.16b, v31.16b\n"
- "sqrdmulh v4.4s, v4.4s, v29.4s\n"
- "and v24.16b, v20.16b, v31.16b\n"
- "sqrdmulh v1.4s, v1.4s, v29.4s\n"
- "and v19.16b, v16.16b, v31.16b\n"
- "sqrdmulh v14.4s, v14.4s, v29.4s\n"
- "sqadd v2.4s, v2.4s, v7.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v18.16b, v4.16b, v25.16b\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
+ "smlal v22.4s, v24.4h, v17.4h\n"
+ "smlal2 v5.4s, v26.8h, v28.8h\n"
+ "ldr d28, [x21, x4]\n"
+ "smlal2 v8.4s, v24.8h, v17.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal v9.4s, v27.4h, v17.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal2 v31.4s, v27.8h, v17.8h\n"
+ "ldr d27, [x20, x4]\n"
+ "smlal v20.4s, v23.4h, v10.4h\n"
+ "smlal v18.4s, v7.4h, v11.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v22.4s, v0.4h, v10.4h\n"
+ "smlal2 v21.4s, v23.8h, v10.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v5.4s, v7.8h, v11.8h\n"
+ "smlal2 v8.4s, v0.8h, v10.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v9.4s, v24.4h, v11.4h\n"
+ "smlal2 v31.4s, v24.8h, v11.8h\n"
+ "smlal v20.4s, v1.4h, v2.4h\n"
+ "smlal v18.4s, v13.4h, v2.4h\n"
+ "smlal v22.4s, v15.4h, v2.4h\n"
+ "smlal2 v21.4s, v1.8h, v2.8h\n"
+ "smlal2 v5.4s, v13.8h, v2.8h\n"
+ "smlal2 v8.4s, v15.8h, v2.8h\n"
+ "smlal v9.4s, v25.4h, v2.4h\n"
+ "smlal2 v31.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v7.4h, v17.4h\n"
+ "smlal v18.4s, v28.4h, v10.4h\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "smlal2 v21.4s, v7.8h, v17.8h\n"
+ "smlal2 v5.4s, v28.8h, v10.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "smlal v9.4s, v26.4h, v3.4h\n"
+ "and v17.16b, v22.16b, v4.16b\n"
+ "smlal2 v31.4s, v26.8h, v3.8h\n"
+ "smlal v20.4s, v28.4h, v3.4h\n"
+ "smlal v18.4s, v27.4h, v3.4h\n"
+ "and v15.16b, v8.16b, v29.16b\n"
+ "smlal2 v21.4s, v28.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v3.8h\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v9.4s, v9.4s, v30.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v30.4s\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v25.16b, v9.16b, v4.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "and v24.16b, v20.16b, v4.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v19.4s\n"
+ "and v23.16b, v18.16b, v4.16b\n"
+ "sqadd v8.4s, v8.4s, v15.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v30.16b, v31.16b, v29.16b\n"
"sshr v24.4s, v24.4s, #0x1f\n"
- "and v17.16b, v1.16b, v25.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v15.16b, v14.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v23.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v17.16b, v21.16b, v29.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v7.16b, v5.16b, v29.16b\n"
+ "sqadd v9.4s, v9.4s, v25.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v24.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "srshl v8.4s, v8.4s, v31.4s\n"
- "srshl v21.4s, v21.4s, v31.4s\n"
- "sqadd v4.4s, v4.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v31.4s\n"
- "sqadd v1.4s, v1.4s, v17.4s\n"
- "srshl v16.4s, v16.4s, v31.4s\n"
- "sqadd v14.4s, v14.4s, v15.4s\n"
- "srshl v2.4s, v2.4s, v25.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v4.4s, v4.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqadd v18.4s, v18.4s, v23.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "srshl v9.4s, v9.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v30.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v4.4s\n"
+ "sqadd v5.4s, v5.4s, v7.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v31.4s, v31.4s, v29.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v14.4s, v14.4s, v25.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "sqxtn2 v8.8h, v2.4s\n"
- "sqxtn2 v21.8h, v4.4s\n"
- "sqxtn2 v20.8h, v1.4s\n"
- "sqxtn2 v16.8h, v14.4s\n"
- "sqadd v8.8h, v8.8h, v22.8h\n"
- "sqadd v21.8h, v21.8h, v22.8h\n"
- "sqadd v20.8h, v20.8h, v22.8h\n"
- "sqadd v16.8h, v16.8h, v22.8h\n"
- "smax v8.8h, v8.8h, v13.8h\n"
- "smax v21.8h, v21.8h, v13.8h\n"
- "smax v20.8h, v20.8h, v13.8h\n"
- "smax v16.8h, v16.8h, v13.8h\n"
- "smin v8.8h, v8.8h, v5.8h\n"
- "smin v21.8h, v21.8h, v5.8h\n"
- "smin v20.8h, v20.8h, v5.8h\n"
- "smin v16.8h, v16.8h, v5.8h\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "str d8, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "sqxtn2 v22.8h, v8.4s\n"
+ "sqxtn2 v9.8h, v31.4s\n"
+ "sqxtn2 v20.8h, v21.4s\n"
+ "sqxtn2 v18.8h, v5.4s\n"
+ "sqadd v22.8h, v22.8h, v12.8h\n"
+ "sqadd v9.8h, v9.8h, v12.8h\n"
+ "sqadd v20.8h, v20.8h, v12.8h\n"
+ "sqadd v18.8h, v18.8h, v12.8h\n"
+ "smax v22.8h, v22.8h, v14.8h\n"
+ "smax v9.8h, v9.8h, v14.8h\n"
+ "smax v20.8h, v20.8h, v14.8h\n"
+ "smax v18.8h, v18.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v6.8h\n"
+ "smin v9.8h, v9.8h, v6.8h\n"
+ "smin v20.8h, v20.8h, v6.8h\n"
+ "smin v18.8h, v18.8h, v6.8h\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d20, [x9, x16]\n"
- "str d16, [x28, x16]\n"
- "add x16, x16, #0x8\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d22, [x15, x5]\n"
+ "str d9, [x14, x5]\n"
+ "str d20, [x13, x5]\n"
+ "str d18, [x12, x5]\n"
+ "add x5, x5, #0x8\n"
"beq 88f\n"
- "add x14, x14, #0x48\n"
+ "add x7, x7, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v8.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v2.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v2.s }[2], [x20]\n"
+ "tbz x3, #2, 5f\n"
+ "ld1 { v22.4s }, [x20], #0x10\n"
+ "tbz x3, #1, 4f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz x3, #0, 7f\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v2.s }[0], [x20]\n"
+ "tbz x3, #0, 7f\n"
+ "ld1 { v8.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v8.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v8.s }[2], [x20]\n"
+ "tbz x3, #1, 6f\n"
+ "ld1 { v22.d }[0], [x20], #0x8\n"
+ "tbz x3, #0, 7f\n"
+ "ld1 { v22.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v8.s }[0], [x20]\n"
+ "tbz x3, #0, 7f\n"
+ "ld1 { v22.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d12, [x14, #0x0]\n"
- "ldr d11, [x14, #0x8]\n"
+ "ldr d15, [x7, #0x0]\n"
+ "ldr d13, [x7, #0x8]\n"
+ "mov v9.16b, v22.16b\n"
+ "mov v31.16b, v8.16b\n"
+ "ldr d28, [x7, #0x10]\n"
+ "ldr d11, [x7, #0x18]\n"
+ "mov v20.16b, v22.16b\n"
"mov v21.16b, v8.16b\n"
- "mov v4.16b, v2.16b\n"
- "ldr d25, [x14, #0x10]\n"
- "ldr d24, [x14, #0x18]\n"
- "mov v20.16b, v8.16b\n"
- "mov v1.16b, v2.16b\n"
- "ldr d23, [x14, #0x20]\n"
- "ldr d7, [x14, #0x28]\n"
- "mov v16.16b, v8.16b\n"
- "mov v14.16b, v2.16b\n"
- "ldr d3, [x14, #0x30]\n"
- "ldr d9, [x14, #0x38]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "ldr d30, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v24.8h, v24.8b, v6.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "usubl v7.8h, v7.8b, v6.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "usubl v3.8h, v3.8b, v6.8b\n"
- "usubl v9.8h, v9.8b, v6.8b\n"
- "usubl v30.8h, v30.8b, v6.8b\n"
- "add x27, x27, x17\n"
- "add x26, x26, x17\n"
- "add x25, x25, x17\n"
- "add x24, x24, x17\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v26.s }[0], [x27], #0x4\n"
- "ld1 { v18.s }[0], [x26], #0x4\n"
- "ld1 { v10.s }[0], [x25], #0x4\n"
- "ld1 { v27.s }[0], [x24], #0x4\n"
- "ld1 { v17.s }[0], [x23], #0x4\n"
- "ld1 { v19.s }[0], [x22], #0x4\n"
- "ld1 { v15.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v26.h }[2], [x27], #0x2\n"
- "ld1 { v18.h }[2], [x26], #0x2\n"
- "ld1 { v10.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v17.h }[2], [x23], #0x2\n"
- "ld1 { v19.h }[2], [x22], #0x2\n"
- "ld1 { v15.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v26.b }[6], [x27]\n"
- "ld1 { v18.b }[6], [x26]\n"
- "ld1 { v10.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v17.b }[6], [x23]\n"
- "ld1 { v19.b }[6], [x22]\n"
- "ld1 { v15.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ldr d23, [x7, #0x20]\n"
+ "ldr d17, [x7, #0x28]\n"
+ "mov v18.16b, v22.16b\n"
+ "mov v5.16b, v8.16b\n"
+ "ldr d10, [x7, #0x30]\n"
+ "ldr d2, [x7, #0x38]\n"
+ "usubl v15.8h, v15.8b, v16.8b\n"
+ "usubl v13.8h, v13.8b, v16.8b\n"
+ "ldr d3, [x7, #0x40]\n"
+ "ldp x27, x26, [x6, #0x0]\n"
+ "usubl v28.8h, v28.8b, v16.8b\n"
+ "usubl v11.8h, v11.8b, v16.8b\n"
+ "usubl v23.8h, v23.8b, v16.8b\n"
+ "usubl v17.8h, v17.8b, v16.8b\n"
+ "usubl v10.8h, v10.8b, v16.8b\n"
+ "usubl v2.8h, v2.8b, v16.8b\n"
+ "ldp x25, x24, [x6, #0x10]\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "add x27, x27, x4\n"
+ "add x26, x26, x4\n"
+ "ldp x23, x22, [x6, #0x20]\n"
+ "add x25, x25, x4\n"
+ "add x24, x24, x4\n"
+ "ldp x21, x20, [x6, #0x30]\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 9f\n"
+ "ld1 { v29.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x26], #0x4\n"
+ "ld1 { v0.s }[0], [x25], #0x4\n"
+ "ld1 { v7.s }[0], [x24], #0x4\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 8f\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "ld1 { v0.h }[2], [x25], #0x2\n"
+ "ld1 { v7.h }[2], [x24], #0x2\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 11f\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "ld1 { v0.b }[6], [x25]\n"
+ "ld1 { v7.b }[6], [x24]\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v26.b }[4], [x27]\n"
- "ld1 { v18.b }[4], [x26]\n"
- "ld1 { v10.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v17.b }[4], [x23]\n"
- "ld1 { v19.b }[4], [x22]\n"
- "ld1 { v15.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x3, #0, 11f\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "ld1 { v0.b }[4], [x25]\n"
+ "ld1 { v7.b }[4], [x24]\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v26.h }[0], [x27], #0x2\n"
- "ld1 { v18.h }[0], [x26], #0x2\n"
- "ld1 { v10.h }[0], [x25], #0x2\n"
- "ld1 { v27.h }[0], [x24], #0x2\n"
- "ld1 { v17.h }[0], [x23], #0x2\n"
- "ld1 { v19.h }[0], [x22], #0x2\n"
- "ld1 { v15.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v26.b }[2], [x27]\n"
- "ld1 { v18.b }[2], [x26]\n"
- "ld1 { v10.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v17.b }[2], [x23]\n"
- "ld1 { v19.b }[2], [x22]\n"
- "ld1 { v15.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x3, #1, 10f\n"
+ "ld1 { v29.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x26], #0x2\n"
+ "ld1 { v0.h }[0], [x25], #0x2\n"
+ "ld1 { v7.h }[0], [x24], #0x2\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 11f\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v7.b }[2], [x24]\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v26.b }[0], [x27]\n"
- "ld1 { v18.b }[0], [x26]\n"
- "ld1 { v10.b }[0], [x25]\n"
- "ld1 { v27.b }[0], [x24]\n"
- "ld1 { v17.b }[0], [x23]\n"
- "ld1 { v19.b }[0], [x22]\n"
- "ld1 { v15.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x3, #0, 11f\n"
+ "ld1 { v29.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x26]\n"
+ "ld1 { v0.b }[0], [x25]\n"
+ "ld1 { v7.b }[0], [x24]\n"
+ "ld1 { v24.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v1.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v26.8h, v26.8b, #0x0\n"
- "smlal v8.4s, v26.4h, v30.4h\n"
- "smlal2 v2.4s, v26.8h, v30.8h\n"
- "ldr x20, [x15, #0x40]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "smlal v8.4s, v18.4h, v12.4h\n"
- "smlal2 v2.4s, v18.8h, v12.8h\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal v21.4s, v26.4h, v3.4h\n"
- "smlal2 v4.4s, v26.8h, v3.8h\n"
- "add x20, x20, x17\n"
- "smlal v8.4s, v10.4h, v11.4h\n"
- "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x20, [x6, #0x40]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ushll v7.8h, v7.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "smlal v21.4s, v27.4h, v11.4h\n"
- "smlal2 v4.4s, v27.8h, v11.8h\n"
- "smlal v8.4s, v19.4h, v24.4h\n"
- "smlal2 v2.4s, v19.8h, v24.8h\n"
- "ushll v17.8h, v17.8b, #0x0\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal v21.4s, v17.4h, v25.4h\n"
- "smlal2 v4.4s, v17.8h, v25.8h\n"
- "smlal v8.4s, v15.4h, v23.4h\n"
- "smlal2 v2.4s, v15.8h, v23.8h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v20.4s, v26.4h, v25.4h\n"
- "smlal2 v1.4s, v26.8h, v25.8h\n"
- "smlal v16.4s, v26.4h, v12.4h\n"
- "smlal2 v14.4s, v26.8h, v12.8h\n"
- "smlal v8.4s, v28.4h, v25.4h\n"
- "smlal2 v2.4s, v28.8h, v25.8h\n"
- "smlal v21.4s, v28.4h, v12.4h\n"
- "smlal2 v4.4s, v28.8h, v12.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v22.4s, v29.4h, v3.4h\n"
+ "smlal2 v8.4s, v29.8h, v3.8h\n"
+ "smlal v9.4s, v29.4h, v10.4h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "add x20, x20, x4\n"
+ "smlal2 v31.4s, v29.8h, v10.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v21.4s, v29.8h, v28.8h\n"
+ "smlal v18.4s, v29.4h, v15.4h\n"
+ "smlal v22.4s, v25.4h, v15.4h\n"
+ "smlal2 v5.4s, v29.8h, v15.8h\n"
+ "smlal2 v8.4s, v25.8h, v15.8h\n"
+ "smlal v9.4s, v7.4h, v13.4h\n"
+ "smlal2 v31.4s, v7.8h, v13.8h\n"
+ "smlal v22.4s, v0.4h, v13.4h\n"
+ "smlal2 v8.4s, v0.8h, v13.8h\n"
+ "smlal v9.4s, v24.4h, v28.4h\n"
+ "smlal v22.4s, v27.4h, v11.4h\n"
+ "smlal2 v31.4s, v24.8h, v28.8h\n"
+ "smlal2 v8.4s, v27.8h, v11.8h\n"
+ "smlal v9.4s, v1.4h, v15.4h\n"
+ "smlal v22.4s, v26.4h, v23.4h\n"
+ "smlal2 v31.4s, v1.8h, v15.8h\n"
+ "smlal2 v8.4s, v26.8h, v23.8h\n"
+ "smlal v22.4s, v1.4h, v28.4h\n"
+ "smlal2 v8.4s, v1.8h, v28.8h\n"
+ "tbz x3, #2, 13f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 12f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 15f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "tbz x3, #0, 15f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "tbz x3, #1, 14f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 15f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "tbz x3, #0, 15f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v31.8h, v31.8b, #0x0\n"
- "ldr x20, [x15, #0x48]\n"
- "smlal v21.4s, v31.4h, v23.4h\n"
- "smlal2 v4.4s, v31.8h, v23.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x6, #0x48]\n"
+ "smlal v9.4s, v30.4h, v23.4h\n"
+ "smlal2 v31.4s, v30.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x3, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x3, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x3, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x20, [x15, #0x50]\n"
- "smlal v21.4s, v28.4h, v7.4h\n"
- "smlal2 v4.4s, v28.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0x50]\n"
+ "smlal v9.4s, v16.4h, v17.4h\n"
+ "smlal2 v31.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 21f\n"
"ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
+ "tbz x3, #1, 20f\n"
"ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
+ "tbz x3, #0, 23f\n"
"ld1 { v27.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
+ "tbz x3, #0, 23f\n"
"ld1 { v27.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x7, #1, 22f\n"
+ "tbz x3, #1, 22f\n"
"ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
+ "tbz x3, #0, 23f\n"
"ld1 { v27.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
+ "tbz x3, #0, 23f\n"
"ld1 { v27.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
"ushll v27.8h, v27.8b, #0x0\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v8.4s, v27.4h, v7.4h\n"
- "smlal2 v2.4s, v27.8h, v7.8h\n"
- "smlal v21.4s, v27.4h, v24.4h\n"
- "smlal2 v4.4s, v27.8h, v24.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v0.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v0.b }[6], [x20]\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v22.4s, v27.4h, v17.4h\n"
+ "smlal2 v8.4s, v27.8h, v17.8h\n"
+ "smlal v9.4s, v27.4h, v11.4h\n"
+ "smlal2 v31.4s, v27.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v0.b }[4], [x20]\n"
+ "tbz x3, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v0.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v0.b }[2], [x20]\n"
+ "tbz x3, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v0.b }[0], [x20]\n"
+ "tbz x3, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v0.8h, v0.8b, #0x0\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v20.4s, v0.4h, v24.4h\n"
- "smlal2 v1.4s, v0.8h, v24.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0x60]\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 29f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 28f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 31f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x3, #0, 31f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x3, #1, 30f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 31f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x3, #0, 31f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ushll v15.8h, v15.8b, #0x0\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v8.4s, v15.4h, v3.4h\n"
- "smlal2 v2.4s, v15.8h, v3.8h\n"
- "smlal v20.4s, v15.4h, v12.4h\n"
- "smlal2 v1.4s, v15.8h, v12.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v0.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v0.b }[6], [x20]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal v22.4s, v29.4h, v10.4h\n"
+ "smlal2 v8.4s, v29.8h, v10.8h\n"
+ "smlal v20.4s, v29.4h, v15.4h\n"
+ "smlal2 v21.4s, v29.8h, v15.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v0.b }[4], [x20]\n"
+ "tbz x3, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v0.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v0.b }[2], [x20]\n"
+ "tbz x3, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v0.b }[0], [x20]\n"
+ "tbz x3, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ushll v0.8h, v0.8b, #0x0\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal v20.4s, v0.4h, v23.4h\n"
- "smlal2 v1.4s, v0.8h, v23.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v6.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v6.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v6.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0x70]\n"
+ "smlal v20.4s, v16.4h, v23.4h\n"
+ "smlal2 v21.4s, v16.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 37f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 36f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 39f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v6.b }[4], [x20]\n"
+ "tbz x3, #0, 39f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v6.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v6.b }[2], [x20]\n"
+ "tbz x3, #1, 38f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 39f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v6.b }[0], [x20]\n"
+ "tbz x3, #0, 39f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ushll v6.8h, v6.8b, #0x0\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v8.4s, v6.4h, v9.4h\n"
- "smlal2 v2.4s, v6.8h, v9.8h\n"
- "smlal v20.4s, v6.4h, v11.4h\n"
- "smlal2 v1.4s, v6.8h, v11.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal v22.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v20.4s, v26.4h, v13.4h\n"
+ "smlal2 v21.4s, v26.8h, v13.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x3, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 42f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x3, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x3, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "ushll v27.8h, v27.8b, #0x0\n"
- "ldr x20, [x15, #0x80]\n"
- "smlal v16.4s, v27.4h, v23.4h\n"
- "smlal2 v14.4s, v27.8h, v23.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0x80]\n"
+ "smlal v18.4s, v16.4h, v23.4h\n"
+ "smlal2 v5.4s, v16.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 45f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 44f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 47f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x3, #0, 47f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x3, #1, 46f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 47f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x3, #0, 47f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ushll v10.8h, v10.8b, #0x0\n"
- "ldr x20, [x15, #0x88]\n"
- "smlal v21.4s, v10.4h, v9.4h\n"
- "smlal2 v4.4s, v10.8h, v9.8h\n"
- "smlal v16.4s, v10.4h, v11.4h\n"
- "smlal2 v14.4s, v10.8h, v11.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal v9.4s, v25.4h, v2.4h\n"
+ "smlal2 v31.4s, v25.8h, v2.8h\n"
+ "smlal v18.4s, v25.4h, v13.4h\n"
+ "smlal2 v5.4s, v25.8h, v13.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x3, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x7, #1, 50f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x3, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x3, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr x20, [x15, #0x90]\n"
- "smlal v16.4s, v28.4h, v7.4h\n"
- "smlal2 v14.4s, v28.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0x90]\n"
+ "smlal v18.4s, v16.4h, v17.4h\n"
+ "smlal2 v5.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 53f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 52f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 55f\n"
+ "ld1 { v1.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x3, #0, 55f\n"
+ "ld1 { v1.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x3, #1, 54f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 55f\n"
+ "ld1 { v1.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x3, #0, 55f\n"
+ "ld1 { v1.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ushll v15.8h, v15.8b, #0x0\n"
- "ldr x20, [x15, #0x98]\n"
- "smlal v20.4s, v15.4h, v3.4h\n"
- "smlal2 v1.4s, v15.8h, v3.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 57f\n"
- "ld1 { v6.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 56f\n"
- "ld1 { v6.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v6.b }[6], [x20]\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ldr x20, [x6, #0x98]\n"
+ "smlal v20.4s, v1.4h, v10.4h\n"
+ "smlal2 v21.4s, v1.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v6.b }[4], [x20]\n"
+ "tbz x3, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x7, #1, 58f\n"
- "ld1 { v6.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v6.b }[2], [x20]\n"
+ "tbz x3, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v6.b }[0], [x20]\n"
+ "tbz x3, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ushll v6.8h, v6.8b, #0x0\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal v21.4s, v6.4h, v30.4h\n"
- "smlal2 v4.4s, v6.8h, v30.8h\n"
- "smlal v16.4s, v6.4h, v25.4h\n"
- "smlal2 v14.4s, v6.8h, v25.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 61f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 60f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "smlal v9.4s, v16.4h, v3.4h\n"
+ "smlal2 v31.4s, v16.8h, v3.8h\n"
+ "smlal v18.4s, v16.4h, v28.4h\n"
+ "smlal2 v5.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 61f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 60f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 63f\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x3, #0, 63f\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x3, #1, 62f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 63f\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x3, #0, 63f\n"
+ "ld1 { v19.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v20.4s, v23.4h, v9.4h\n"
- "smlal2 v1.4s, v23.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 65f\n"
- "ld1 { v12.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 64f\n"
- "ld1 { v12.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
- "ld1 { v12.b }[6], [x20]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [x6, #0xa8]\n"
+ "smlal v20.4s, v19.4h, v2.4h\n"
+ "smlal2 v21.4s, v19.8h, v2.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 67f\n"
- "ld1 { v12.b }[4], [x20]\n"
+ "tbz x3, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 66f\n"
- "ld1 { v12.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
- "ld1 { v12.b }[2], [x20]\n"
+ "tbz x3, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 67f\n"
- "ld1 { v12.b }[0], [x20]\n"
+ "tbz x3, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ushll v12.8h, v12.8b, #0x0\n"
- "ldr x20, [x15, #0xb0]\n"
- "smlal v20.4s, v12.4h, v7.4h\n"
- "smlal2 v1.4s, v12.8h, v7.8h\n"
- "smlal v16.4s, v12.4h, v24.4h\n"
- "smlal2 v14.4s, v12.8h, v24.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 69f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 68f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v17.4h\n"
+ "smlal2 v21.4s, v16.8h, v17.8h\n"
+ "smlal v18.4s, v16.4h, v11.4h\n"
+ "smlal2 v5.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 69f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 68f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 71f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x3, #0, 71f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x7, #1, 70f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x3, #1, 70f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 71f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x3, #0, 71f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ushll v10.8h, v10.8b, #0x0\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal v16.4s, v10.4h, v9.4h\n"
- "smlal2 v14.4s, v10.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 73f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 72f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal v18.4s, v29.4h, v2.4h\n"
+ "smlal2 v5.4s, v29.8h, v2.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 75f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x3, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x7, #1, 74f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x3, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 75f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x3, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ushll v15.8h, v15.8b, #0x0\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal v20.4s, v15.4h, v30.4h\n"
- "smlal2 v1.4s, v15.8h, v30.8h\n"
- "smlal v16.4s, v15.4h, v3.4h\n"
- "smlal2 v14.4s, v15.8h, v3.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 77f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 76f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v3.4h\n"
+ "smlal2 v21.4s, v16.8h, v3.8h\n"
+ "smlal v18.4s, v16.4h, v10.4h\n"
+ "smlal2 v5.4s, v16.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x3, #2, 77f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x3, #1, 76f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x3, #0, 79f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x3, #0, 79f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x7, #1, 78f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x3, #1, 78f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x3, #0, 79f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x3, #0, 79f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v16.4s, v28.4h, v30.4h\n"
- "smlal2 v14.4s, v28.8h, v30.8h\n"
- "tbz x7, #2, 81f\n"
- "ld1 { v19.4s }, [x13], #0x10\n"
- "ld1 { v23.4s }, [x12], #0x10\n"
- "tbz x7, #1, 80f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v24.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v24.s }[2], [x12]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v18.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v17.8h, v3.8h\n"
+ "tbz x3, #2, 81f\n"
+ "ld1 { v16.4s }, [x8], #0x10\n"
+ "ld1 { v23.4s }, [x17], #0x10\n"
+ "tbz x3, #1, 80f\n"
+ "ld1 { v26.d }[0], [x8], #0x8\n"
+ "ld1 { v2.d }[0], [x17], #0x8\n"
+ "tbz x3, #0, 83f\n"
+ "ld1 { v26.s }[2], [x8]\n"
+ "ld1 { v2.s }[2], [x17]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v24.s }[0], [x12]\n"
+ "tbz x3, #0, 83f\n"
+ "ld1 { v26.s }[0], [x8]\n"
+ "ld1 { v2.s }[0], [x17]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 82f\n"
- "ld1 { v19.d }[0], [x13], #0x8\n"
- "ld1 { v23.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v19.s }[2], [x13]\n"
- "ld1 { v23.s }[2], [x12]\n"
+ "tbz x3, #1, 82f\n"
+ "ld1 { v16.d }[0], [x8], #0x8\n"
+ "ld1 { v23.d }[0], [x17], #0x8\n"
+ "tbz x3, #0, 83f\n"
+ "ld1 { v16.s }[2], [x8]\n"
+ "ld1 { v23.s }[2], [x17]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v19.s }[0], [x13]\n"
- "ld1 { v23.s }[0], [x12]\n"
+ "tbz x3, #0, 83f\n"
+ "ld1 { v16.s }[0], [x8]\n"
+ "ld1 { v23.s }[0], [x17]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v8.4s, v8.4s, v19.4s\n"
- "and v17.16b, v8.16b, v23.16b\n"
- "add x11, x11, x16\n"
- "add x10, x10, x16\n"
- "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v16.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v26.4s\n"
+ "add x15, x15, x5\n"
+ "add x14, x14, x5\n"
+ "sqrdmulh v9.4s, v9.4s, v16.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v16.4s\n"
+ "add x13, x13, x5\n"
+ "add x12, x12, x5\n"
+ "sqrdmulh v18.4s, v18.4s, v16.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v26.4s\n"
+ "and v17.16b, v22.16b, v23.16b\n"
+ "and v16.16b, v8.16b, v2.16b\n"
+ "and v19.16b, v9.16b, v23.16b\n"
+ "and v28.16b, v20.16b, v23.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v26.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "add x9, x9, x16\n"
- "add x28, x28, x16\n"
- "and v11.16b, v2.16b, v24.16b\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqrdmulh v20.4s, v20.4s, v19.4s\n"
- "sqrdmulh v16.4s, v16.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v17.4s\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v28.16b, v21.16b, v23.16b\n"
- "sqrdmulh v4.4s, v4.4s, v18.4s\n"
- "and v17.16b, v20.16b, v23.16b\n"
- "sqrdmulh v1.4s, v1.4s, v18.4s\n"
- "and v19.16b, v16.16b, v23.16b\n"
- "sqrdmulh v14.4s, v14.4s, v18.4s\n"
- "sqadd v2.4s, v2.4s, v11.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v31.16b, v2.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "and v18.16b, v4.16b, v24.16b\n"
+ "and v0.16b, v21.16b, v2.16b\n"
+ "sqadd v22.4s, v22.4s, v17.4s\n"
+ "and v17.16b, v18.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v16.4s\n"
+ "and v16.16b, v5.16b, v2.16b\n"
+ "sqadd v9.4s, v9.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "and v12.16b, v1.16b, v24.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "and v25.16b, v14.16b, v24.16b\n"
- "sqadd v21.4s, v21.4s, v28.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v17.4s\n"
- "sshr v12.4s, v12.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v19.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "srshl v8.4s, v8.4s, v23.4s\n"
- "srshl v21.4s, v21.4s, v23.4s\n"
- "sqadd v4.4s, v4.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v28.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v23.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
"srshl v20.4s, v20.4s, v23.4s\n"
- "sqadd v1.4s, v1.4s, v12.4s\n"
- "srshl v16.4s, v16.4s, v23.4s\n"
- "sqadd v14.4s, v14.4s, v25.4s\n"
- "srshl v2.4s, v2.4s, v24.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "srshl v4.4s, v4.4s, v24.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v2.4s\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v31.4s, v31.4s, v2.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
"sqxtn v20.4h, v20.4s\n"
- "srshl v14.4s, v14.4s, v24.4s\n"
- "sqxtn v16.4h, v16.4s\n"
- "sqxtn2 v8.8h, v2.4s\n"
- "sqxtn2 v21.8h, v4.4s\n"
- "sqxtn2 v20.8h, v1.4s\n"
- "sqxtn2 v16.8h, v14.4s\n"
- "sqadd v8.8h, v8.8h, v22.8h\n"
- "sqadd v21.8h, v21.8h, v22.8h\n"
- "sqadd v20.8h, v20.8h, v22.8h\n"
- "sqadd v16.8h, v16.8h, v22.8h\n"
- "smax v8.8h, v8.8h, v13.8h\n"
- "smax v21.8h, v21.8h, v13.8h\n"
- "smax v20.8h, v20.8h, v13.8h\n"
- "smax v16.8h, v16.8h, v13.8h\n"
- "smin v8.8h, v8.8h, v5.8h\n"
- "smin v21.8h, v21.8h, v5.8h\n"
- "smin v20.8h, v20.8h, v5.8h\n"
- "smin v16.8h, v16.8h, v5.8h\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "srshl v5.4s, v5.4s, v2.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "sqxtn2 v22.8h, v8.4s\n"
+ "sqxtn2 v9.8h, v31.4s\n"
+ "sqxtn2 v20.8h, v21.4s\n"
+ "sqxtn2 v18.8h, v5.4s\n"
+ "sqadd v22.8h, v22.8h, v12.8h\n"
+ "sqadd v9.8h, v9.8h, v12.8h\n"
+ "sqadd v20.8h, v20.8h, v12.8h\n"
+ "sqadd v18.8h, v18.8h, v12.8h\n"
+ "smax v22.8h, v22.8h, v14.8h\n"
+ "smax v9.8h, v9.8h, v14.8h\n"
+ "smax v20.8h, v20.8h, v14.8h\n"
+ "smax v18.8h, v18.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v6.8h\n"
+ "smin v9.8h, v9.8h, v6.8h\n"
+ "smin v20.8h, v20.8h, v6.8h\n"
+ "smin v18.8h, v18.8h, v6.8h\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "tbz x7, #2, 85f\n"
- "st1 { v8.s }[0], [x11], #0x4\n"
- "st1 { v21.s }[0], [x10], #0x4\n"
- "st1 { v20.s }[0], [x9], #0x4\n"
- "st1 { v16.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 84f\n"
- "st1 { v8.h }[2], [x11], #0x2\n"
- "st1 { v21.h }[2], [x10], #0x2\n"
- "st1 { v20.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v8.b }[6], [x11], #0x1\n"
- "st1 { v21.b }[6], [x10], #0x1\n"
- "st1 { v20.b }[6], [x9], #0x1\n"
- "st1 { v16.b }[6], [x28], #0x1\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "tbz x3, #2, 85f\n"
+ "st1 { v22.s }[0], [x15], #0x4\n"
+ "st1 { v9.s }[0], [x14], #0x4\n"
+ "st1 { v20.s }[0], [x13], #0x4\n"
+ "st1 { v18.s }[0], [x12], #0x4\n"
+ "tbz x3, #1, 84f\n"
+ "st1 { v22.h }[2], [x15], #0x2\n"
+ "st1 { v9.h }[2], [x14], #0x2\n"
+ "st1 { v20.h }[2], [x13], #0x2\n"
+ "st1 { v18.h }[2], [x12], #0x2\n"
+ "tbz x3, #0, 87f\n"
+ "st1 { v22.b }[6], [x15], #0x1\n"
+ "st1 { v9.b }[6], [x14], #0x1\n"
+ "st1 { v20.b }[6], [x13], #0x1\n"
+ "st1 { v18.b }[6], [x12], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v8.b }[4], [x11], #0x1\n"
- "st1 { v21.b }[4], [x10], #0x1\n"
- "st1 { v20.b }[4], [x9], #0x1\n"
- "st1 { v16.b }[4], [x28], #0x1\n"
+ "tbz x3, #0, 87f\n"
+ "st1 { v22.b }[4], [x15], #0x1\n"
+ "st1 { v9.b }[4], [x14], #0x1\n"
+ "st1 { v20.b }[4], [x13], #0x1\n"
+ "st1 { v18.b }[4], [x12], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 86f\n"
- "st1 { v8.h }[0], [x11], #0x2\n"
- "st1 { v21.h }[0], [x10], #0x2\n"
- "st1 { v20.h }[0], [x9], #0x2\n"
- "st1 { v16.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v8.b }[2], [x11], #0x1\n"
- "st1 { v21.b }[2], [x10], #0x1\n"
- "st1 { v20.b }[2], [x9], #0x1\n"
- "st1 { v16.b }[2], [x28], #0x1\n"
+ "tbz x3, #1, 86f\n"
+ "st1 { v22.h }[0], [x15], #0x2\n"
+ "st1 { v9.h }[0], [x14], #0x2\n"
+ "st1 { v20.h }[0], [x13], #0x2\n"
+ "st1 { v18.h }[0], [x12], #0x2\n"
+ "tbz x3, #0, 87f\n"
+ "st1 { v22.b }[2], [x15], #0x1\n"
+ "st1 { v9.b }[2], [x14], #0x1\n"
+ "st1 { v20.b }[2], [x13], #0x1\n"
+ "st1 { v18.b }[2], [x12], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v8.b }[0], [x11], #0x1\n"
- "st1 { v21.b }[0], [x10], #0x1\n"
- "st1 { v20.b }[0], [x9], #0x1\n"
- "st1 { v16.b }[0], [x28], #0x1\n"
+ "tbz x3, #0, 87f\n"
+ "st1 { v22.b }[0], [x15], #0x1\n"
+ "st1 { v9.b }[0], [x14], #0x1\n"
+ "st1 { v20.b }[0], [x13], #0x1\n"
+ "st1 { v18.b }[0], [x12], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 9316732632..42de21c670 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -111,2071 +111,2071 @@ void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x3, x2, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v2.16b }, [x20]\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x8, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_c_offset]\n"
"add x20, x23, %[offsetof_Requantize32_minval]\n"
- "ld1r { v25.8h }, [x21]\n"
- "ld1r { v12.8h }, [x20]\n"
+ "ld1r { v15.8h }, [x21]\n"
+ "ld1r { v31.8h }, [x20]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "mov x4, #0x0\n"
- "ld1r { v26.8h }, [x20]\n"
- "mov x5, #0x0\n"
- "add x6, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x16, x15, [x22, #0x0]\n"
- "ldp x14, x13, [x22, #0x10]\n"
- "cbz x3, 3f\n"
- "ldr d21, [x7, #0x0]\n"
- "ldr d15, [x7, #0x8]\n"
- "subs x3, x3, #0x1\n"
- "usubl v21.8h, v21.8b, v2.8b\n"
- "ldr d29, [x7, #0x10]\n"
- "ldr d18, [x7, #0x18]\n"
- "usubl v15.8h, v15.8b, v2.8b\n"
- "usubl v29.8h, v29.8b, v2.8b\n"
- "ldr d3, [x7, #0x20]\n"
+ "ld1r { v28.8h }, [x20]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d6, [x5, #0x0]\n"
+ "ldr d20, [x5, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ldr d9, [x5, #0x10]\n"
+ "ldr d1, [x5, #0x18]\n"
+ "ldr d17, [x5, #0x20]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "usubl v18.8h, v18.8b, v2.8b\n"
- "usubl v3.8h, v3.8b, v2.8b\n"
- "ldr q13, [x20, #0x0]\n"
+ "usubl v6.8h, v6.8b, v14.8b\n"
+ "usubl v20.8h, v20.8b, v14.8b\n"
+ "usubl v9.8h, v9.8b, v14.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldr q30, [x20, #0x0]\n"
"ldr q24, [x20, #0x10]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
"add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x6, #0x0]\n"
- "ldp x27, x26, [x6, #0x10]\n"
- "mov v7.16b, v13.16b\n"
- "mov v14.16b, v24.16b\n"
- "ldp x25, x24, [x6, #0x20]\n"
- "ldp x23, x22, [x6, #0x30]\n"
- "mov v27.16b, v13.16b\n"
- "mov v22.16b, v24.16b\n"
- "ldp x21, x20, [x6, #0x40]\n"
- "ldr d10, [x9, x4]\n"
- "mov v8.16b, v13.16b\n"
- "mov v17.16b, v24.16b\n"
- "ldr d16, [x28, x4]\n"
- "ldr d23, [x27, x4]\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "ushll v16.8h, v16.8b, #0x0\n"
- "ldr d30, [x26, x4]\n"
- "ldr d4, [x25, x4]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr d28, [x24, x4]\n"
- "ldr d31, [x23, x4]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "mov v12.16b, v30.16b\n"
+ "mov v13.16b, v24.16b\n"
+ "mov v5.16b, v30.16b\n"
+ "mov v23.16b, v24.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v7.16b, v30.16b\n"
+ "mov v19.16b, v24.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d18, [x9, x2]\n"
+ "ldr d4, [x28, x2]\n"
+ "ldr d0, [x27, x2]\n"
+ "ldr d25, [x26, x2]\n"
+ "ldr d10, [x25, x2]\n"
+ "ldr d11, [x24, x2]\n"
+ "ldr d22, [x23, x2]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr d21, [x22, x2]\n"
+ "ldr d8, [x21, x2]\n"
"ushll v4.8h, v4.8b, #0x0\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ldr d1, [x22, x4]\n"
- "ldr d9, [x21, x4]\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v1.8h, v1.8b, #0x0\n"
- "ldr d11, [x20, x4]\n"
- "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr d26, [x20, x2]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
"ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
"beq 2f\n"
"1:" // Loop
- "ldr d5, [x7, #0x28]\n"
- "ldr d6, [x7, #0x30]\n"
- "smlal v13.4s, v10.4h, v21.4h\n"
- "smlal2 v24.4s, v10.8h, v21.8h\n"
- "ldr d19, [x7, #0x38]\n"
- "ldr d0, [x7, #0x40]\n"
- "smlal v13.4s, v16.4h, v15.4h\n"
- "smlal v7.4s, v16.4h, v21.4h\n"
- "ldr d10, [x7, #0x48]\n"
- "ldr d20, [x7, #0x50]\n"
- "smlal v27.4s, v23.4h, v21.4h\n"
- "smlal v8.4s, v30.4h, v21.4h\n"
- "ldr x21, [x6, #0x50]\n"
- "smlal2 v24.4s, v16.8h, v15.8h\n"
- "smlal v13.4s, v4.4h, v29.4h\n"
- "ldr x20, [x6, #0x58]\n"
- "smlal2 v14.4s, v16.8h, v21.8h\n"
- "ldr d16, [x21, x4]\n"
- "smlal2 v22.4s, v23.8h, v21.8h\n"
- "ushll v16.8h, v16.8b, #0x0\n"
- "smlal2 v17.4s, v30.8h, v21.8h\n"
- "ldr d21, [x20, x4]\n"
- "smlal v7.4s, v4.4h, v15.4h\n"
- "ldr x22, [x6, #0x60]\n"
- "smlal v27.4s, v30.4h, v15.4h\n"
- "smlal v8.4s, v28.4h, v15.4h\n"
+ "ldr d29, [x5, #0x28]\n"
+ "ldr d2, [x5, #0x30]\n"
+ "smlal v30.4s, v18.4h, v6.4h\n"
+ "smlal2 v24.4s, v18.8h, v6.8h\n"
+ "ldr d18, [x5, #0x38]\n"
+ "ldr d27, [x5, #0x40]\n"
+ "smlal v12.4s, v4.4h, v6.4h\n"
+ "smlal v5.4s, v0.4h, v6.4h\n"
+ "ldr d16, [x5, #0x48]\n"
+ "ldr d3, [x5, #0x50]\n"
+ "smlal v7.4s, v25.4h, v6.4h\n"
+ "smlal2 v13.4s, v4.8h, v6.8h\n"
+ "ldr x23, [x4, #0x50]\n"
+ "smlal2 v23.4s, v0.8h, v6.8h\n"
+ "smlal2 v19.4s, v25.8h, v6.8h\n"
+ "ldr d6, [x5, #0x58]\n"
+ "smlal v30.4s, v4.4h, v20.4h\n"
+ "smlal2 v24.4s, v4.8h, v20.8h\n"
+ "ldr d4, [x5, #0x60]\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v12.4s, v10.4h, v20.4h\n"
+ "smlal v5.4s, v25.4h, v20.4h\n"
+ "ldr x22, [x4, #0x60]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v7.4s, v11.4h, v20.4h\n"
+ "smlal2 v13.4s, v10.8h, v20.8h\n"
+ "ldr x21, [x4, #0x68]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal2 v23.4s, v25.8h, v20.8h\n"
+ "smlal2 v19.4s, v11.8h, v20.8h\n"
+ "ldr d20, [x23, x2]\n"
+ "ldr x27, [x4, #0x70]\n"
+ "smlal v30.4s, v10.4h, v9.4h\n"
+ "smlal2 v24.4s, v10.8h, v9.8h\n"
+ "ldr d10, [x20, x2]\n"
+ "usubl v18.8h, v18.8b, v14.8b\n"
+ "smlal v12.4s, v22.4h, v9.4h\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x4, #0x78]\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal2 v13.4s, v22.8h, v9.8h\n"
+ "ldr x26, [x4, #0x80]\n"
+ "ldr x25, [x4, #0x88]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v23.4s, v11.8h, v9.8h\n"
+ "ldr x24, [x4, #0x90]\n"
+ "ldr x23, [x4, #0x98]\n"
+ "smlal v30.4s, v22.4h, v1.4h\n"
+ "smlal2 v24.4s, v22.8h, v1.8h\n"
+ "ldr d22, [x22, x2]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v20.4h, v9.4h\n"
+ "smlal2 v19.4s, v20.8h, v9.8h\n"
+ "ldr d9, [x21, x2]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v12.4s, v21.4h, v1.4h\n"
+ "smlal v5.4s, v20.4h, v1.4h\n"
+ "usubl v6.8h, v6.8b, v14.8b\n"
+ "ldr x22, [x4, #0xa0]\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "smlal2 v13.4s, v21.8h, v1.8h\n"
+ "smlal2 v23.4s, v20.8h, v1.8h\n"
+ "ldr x21, [x4, #0xa8]\n"
+ "smlal v30.4s, v21.4h, v17.4h\n"
+ "smlal2 v24.4s, v21.8h, v17.8h\n"
+ "ldr d21, [x27, x2]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x20, x2]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v12.4s, v22.4h, v17.4h\n"
+ "smlal v5.4s, v10.4h, v17.4h\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "ldr x13, [x4, #0xb8]\n"
+ "smlal2 v13.4s, v22.8h, v17.8h\n"
+ "ldr d22, [x26, x2]\n"
+ "smlal2 v23.4s, v10.8h, v17.8h\n"
"ushll v21.8h, v21.8b, #0x0\n"
- "ldr x20, [x6, #0x68]\n"
- "smlal2 v24.4s, v4.8h, v29.8h\n"
- "smlal v13.4s, v31.4h, v18.4h\n"
- "usubl v5.8h, v5.8b, v2.8b\n"
- "ldr x21, [x6, #0x70]\n"
- "smlal2 v14.4s, v4.8h, v15.8h\n"
- "ldr d4, [x22, x4]\n"
- "smlal2 v22.4s, v30.8h, v15.8h\n"
- "ushll v4.8h, v4.8b, #0x0\n"
- "smlal2 v17.4s, v28.8h, v15.8h\n"
- "ldr d15, [x20, x4]\n"
- "smlal v7.4s, v31.4h, v29.4h\n"
- "usubl v6.8h, v6.8b, v2.8b\n"
- "smlal v27.4s, v28.4h, v29.4h\n"
- "smlal v8.4s, v16.4h, v29.4h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "ldr x20, [x6, #0x78]\n"
- "smlal2 v24.4s, v31.8h, v18.8h\n"
- "smlal v13.4s, v1.4h, v3.4h\n"
- "usubl v19.8h, v19.8b, v2.8b\n"
- "ldr x22, [x6, #0x80]\n"
- "smlal2 v14.4s, v31.8h, v29.8h\n"
- "ldr d31, [x21, x4]\n"
- "smlal2 v22.4s, v28.8h, v29.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v17.4s, v16.8h, v29.8h\n"
- "ldr d29, [x20, x4]\n"
- "smlal v7.4s, v1.4h, v18.4h\n"
- "usubl v0.8h, v0.8b, v2.8b\n"
- "smlal v27.4s, v16.4h, v18.4h\n"
- "smlal v8.4s, v21.4h, v18.4h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x20, [x6, #0x88]\n"
- "smlal2 v24.4s, v1.8h, v3.8h\n"
- "smlal v13.4s, v23.4h, v5.4h\n"
- "usubl v10.8h, v10.8b, v2.8b\n"
- "ldr x21, [x6, #0x90]\n"
- "smlal2 v14.4s, v1.8h, v18.8h\n"
- "ldr d1, [x22, x4]\n"
- "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "smlal v30.4s, v0.4h, v29.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d0, [x25, x2]\n"
"ushll v1.8h, v1.8b, #0x0\n"
- "smlal2 v17.4s, v21.8h, v18.8h\n"
- "ldr d18, [x20, x4]\n"
- "smlal v7.4s, v4.4h, v3.4h\n"
- "usubl v20.8h, v20.8b, v2.8b\n"
- "smlal v27.4s, v21.4h, v3.4h\n"
- "smlal v8.4s, v9.4h, v3.4h\n"
- "ldr x20, [x6, #0x98]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "smlal2 v24.4s, v23.8h, v5.8h\n"
- "ldr d23, [x7, #0x58]\n"
- "smlal v13.4s, v30.4h, v6.4h\n"
- "usubl v23.8h, v23.8b, v2.8b\n"
- "smlal2 v14.4s, v4.8h, v3.8h\n"
- "ldr d4, [x21, x4]\n"
- "smlal2 v22.4s, v21.8h, v3.8h\n"
- "ldr x23, [x6, #0xa0]\n"
- "smlal2 v17.4s, v9.8h, v3.8h\n"
- "ldr d3, [x20, x4]\n"
- "smlal v7.4s, v30.4h, v5.4h\n"
- "ushll v4.8h, v4.8b, #0x0\n"
- "smlal v27.4s, v11.4h, v5.4h\n"
- "smlal v8.4s, v15.4h, v5.4h\n"
- "ushll v3.8h, v3.8b, #0x0\n"
- "ldr x22, [x6, #0xa8]\n"
- "smlal2 v24.4s, v30.8h, v6.8h\n"
- "smlal v13.4s, v28.4h, v19.4h\n"
- "ldr x21, [x6, #0xb0]\n"
- "ldr x20, [x6, #0xb8]\n"
- "smlal2 v14.4s, v30.8h, v5.8h\n"
- "ldr d30, [x7, #0x60]\n"
- "smlal2 v22.4s, v11.8h, v5.8h\n"
- "usubl v30.8h, v30.8b, v2.8b\n"
- "smlal2 v17.4s, v15.8h, v5.8h\n"
- "ldr d5, [x23, x4]\n"
- "smlal v7.4s, v28.4h, v6.4h\n"
- "ushll v5.8h, v5.8b, #0x0\n"
- "smlal v27.4s, v15.4h, v6.4h\n"
- "smlal v8.4s, v31.4h, v6.4h\n"
- "ldr x12, [x6, #0xc0]\n"
- "ldr x11, [x6, #0xc8]\n"
- "smlal2 v24.4s, v28.8h, v19.8h\n"
- "smlal v13.4s, v16.4h, v0.4h\n"
- "ldr x10, [x6, #0xd0]\n"
- "ldr x9, [x6, #0xd8]\n"
- "smlal2 v14.4s, v28.8h, v6.8h\n"
- "ldr d28, [x7, #0x68]\n"
- "smlal2 v22.4s, v15.8h, v6.8h\n"
- "usubl v28.8h, v28.8b, v2.8b\n"
- "smlal2 v17.4s, v31.8h, v6.8h\n"
- "ldr d6, [x22, x4]\n"
- "smlal v7.4s, v16.4h, v19.4h\n"
- "ushll v6.8h, v6.8b, #0x0\n"
- "smlal v27.4s, v31.4h, v19.4h\n"
- "smlal v8.4s, v29.4h, v19.4h\n"
- "ldr x28, [x6, #0xe0]\n"
- "ldr x27, [x6, #0xe8]\n"
- "smlal2 v24.4s, v16.8h, v0.8h\n"
- "smlal v13.4s, v21.4h, v10.4h\n"
- "ldr x26, [x6, #0xf0]\n"
- "ldr x25, [x6, #0xf8]\n"
- "smlal2 v14.4s, v16.8h, v19.8h\n"
- "ldr d16, [x7, #0x70]\n"
- "smlal2 v22.4s, v31.8h, v19.8h\n"
- "usubl v16.8h, v16.8b, v2.8b\n"
- "smlal2 v17.4s, v29.8h, v19.8h\n"
- "ldr d19, [x21, x4]\n"
- "smlal v7.4s, v21.4h, v0.4h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "smlal v27.4s, v29.4h, v0.4h\n"
- "smlal v8.4s, v1.4h, v0.4h\n"
- "ldr x24, [x6, #0x100]\n"
- "ldr x23, [x6, #0x108]\n"
- "smlal2 v24.4s, v21.8h, v10.8h\n"
- "smlal v13.4s, v11.4h, v20.4h\n"
- "ldr x22, [x6, #0x110]\n"
- "ldr x21, [x6, #0x118]\n"
- "smlal2 v14.4s, v21.8h, v0.8h\n"
- "ldr d21, [x7, #0x78]\n"
- "smlal2 v22.4s, v29.8h, v0.8h\n"
- "usubl v21.8h, v21.8b, v2.8b\n"
- "smlal2 v17.4s, v1.8h, v0.8h\n"
- "ldr d0, [x20, x4]\n"
- "smlal v7.4s, v9.4h, v10.4h\n"
+ "smlal v7.4s, v8.4h, v17.4h\n"
+ "smlal2 v19.4s, v8.8h, v17.8h\n"
+ "ldr d17, [x24, x2]\n"
+ "ldr x12, [x4, #0xc0]\n"
+ "smlal v12.4s, v25.4h, v29.4h\n"
+ "smlal v5.4s, v26.4h, v29.4h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "ldr x11, [x4, #0xc8]\n"
+ "smlal2 v13.4s, v25.8h, v29.8h\n"
+ "smlal2 v23.4s, v26.8h, v29.8h\n"
"ushll v0.8h, v0.8b, #0x0\n"
- "smlal v27.4s, v1.4h, v10.4h\n"
- "smlal v8.4s, v18.4h, v10.4h\n"
+ "ldr x10, [x4, #0xd0]\n"
+ "smlal v30.4s, v25.4h, v2.4h\n"
+ "smlal2 v24.4s, v25.8h, v2.8h\n"
+ "ldr d25, [x23, x2]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v7.4s, v9.4h, v29.4h\n"
+ "smlal2 v19.4s, v9.8h, v29.8h\n"
+ "ldr d29, [x22, x2]\n"
+ "ldr x9, [x4, #0xd8]\n"
+ "smlal v12.4s, v11.4h, v2.4h\n"
+ "smlal v5.4s, v9.4h, v2.4h\n"
+ "ldr x28, [x4, #0xe0]\n"
+ "ldr x27, [x4, #0xe8]\n"
+ "smlal2 v13.4s, v11.8h, v2.8h\n"
+ "smlal2 v23.4s, v9.8h, v2.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x26, [x4, #0xf0]\n"
+ "smlal v30.4s, v11.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v18.8h\n"
+ "ldr d11, [x5, #0x68]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v7.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "ldr d2, [x21, x2]\n"
+ "ldr x25, [x4, #0xf8]\n"
+ "smlal v12.4s, v20.4h, v18.4h\n"
+ "smlal v5.4s, v21.4h, v18.4h\n"
+ "ldr x24, [x4, #0x100]\n"
+ "ldr x23, [x4, #0x108]\n"
+ "smlal2 v13.4s, v20.8h, v18.8h\n"
+ "smlal2 v23.4s, v21.8h, v18.8h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "ldr x22, [x4, #0x110]\n"
+ "smlal v30.4s, v20.4h, v27.4h\n"
+ "smlal2 v24.4s, v20.8h, v27.8h\n"
+ "ldr d20, [x5, #0x70]\n"
+ "ushll v2.8h, v2.8b, #0x0\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "smlal2 v19.4s, v1.8h, v18.8h\n"
+ "ldr d18, [x20, x2]\n"
+ "ldr x21, [x4, #0x118]\n"
+ "smlal v12.4s, v10.4h, v27.4h\n"
+ "smlal v5.4s, v1.4h, v27.4h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "subs x3, x3, #0x1\n"
- "smlal2 v24.4s, v11.8h, v20.8h\n"
- "ldr d11, [x7, #0x80]\n"
- "smlal v13.4s, v15.4h, v23.4h\n"
- "usubl v11.8h, v11.8b, v2.8b\n"
- "smlal2 v14.4s, v9.8h, v10.8h\n"
- "ldr d9, [x12, x4]\n"
- "smlal2 v22.4s, v1.8h, v10.8h\n"
- "ushll v9.8h, v9.8b, #0x0\n"
- "smlal2 v17.4s, v18.8h, v10.8h\n"
- "ldr d10, [x11, x4]\n"
- "smlal v7.4s, v15.4h, v20.4h\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "smlal v27.4s, v4.4h, v20.4h\n"
- "smlal v8.4s, v3.4h, v20.4h\n"
- "smlal2 v24.4s, v15.8h, v23.8h\n"
- "smlal v13.4s, v31.4h, v30.4h\n"
- "smlal2 v14.4s, v15.8h, v20.8h\n"
- "ldr d15, [x7, #0x88]\n"
- "smlal2 v22.4s, v4.8h, v20.8h\n"
- "usubl v15.8h, v15.8b, v2.8b\n"
- "smlal2 v17.4s, v3.8h, v20.8h\n"
- "ldr d20, [x10, x4]\n"
- "smlal v7.4s, v31.4h, v23.4h\n"
- "ushll v20.8h, v20.8b, #0x0\n"
- "smlal v27.4s, v3.4h, v23.4h\n"
- "smlal v8.4s, v5.4h, v23.4h\n"
- "smlal2 v24.4s, v31.8h, v30.8h\n"
- "smlal v13.4s, v29.4h, v28.4h\n"
- "smlal2 v14.4s, v31.8h, v23.8h\n"
- "ldr d31, [x7, #0x90]\n"
- "smlal2 v22.4s, v3.8h, v23.8h\n"
- "usubl v31.8h, v31.8b, v2.8b\n"
- "smlal2 v17.4s, v5.8h, v23.8h\n"
- "ldr d23, [x9, x4]\n"
- "smlal v7.4s, v29.4h, v30.4h\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal v27.4s, v5.4h, v30.4h\n"
- "smlal v8.4s, v6.4h, v30.4h\n"
- "smlal2 v24.4s, v29.8h, v28.8h\n"
- "smlal v13.4s, v1.4h, v16.4h\n"
- "smlal2 v14.4s, v29.8h, v30.8h\n"
- "ldr d29, [x7, #0x98]\n"
- "smlal2 v22.4s, v5.8h, v30.8h\n"
- "usubl v29.8h, v29.8b, v2.8b\n"
- "smlal2 v17.4s, v6.8h, v30.8h\n"
- "ldr d30, [x28, x4]\n"
- "smlal v7.4s, v1.4h, v28.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v27.4s, v6.4h, v28.4h\n"
- "smlal v8.4s, v19.4h, v28.4h\n"
- "smlal2 v24.4s, v1.8h, v16.8h\n"
- "smlal v13.4s, v4.4h, v21.4h\n"
- "smlal2 v14.4s, v1.8h, v28.8h\n"
- "ldr d1, [x7, #0xa0]\n"
- "smlal2 v22.4s, v6.8h, v28.8h\n"
- "usubl v1.8h, v1.8b, v2.8b\n"
- "smlal2 v17.4s, v19.8h, v28.8h\n"
- "ldr d28, [x27, x4]\n"
- "smlal v7.4s, v18.4h, v16.4h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v27.4s, v19.4h, v16.4h\n"
- "smlal v8.4s, v0.4h, v16.4h\n"
- "smlal2 v24.4s, v4.8h, v21.8h\n"
- "ldr d4, [x7, #0xa8]\n"
- "smlal v13.4s, v3.4h, v11.4h\n"
- "usubl v4.8h, v4.8b, v2.8b\n"
- "smlal2 v14.4s, v18.8h, v16.8h\n"
- "ldr d18, [x26, x4]\n"
- "smlal2 v22.4s, v19.8h, v16.8h\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v13.4s, v10.8h, v27.8h\n"
+ "smlal2 v23.4s, v1.8h, v27.8h\n"
+ "usubl v20.8h, v20.8b, v14.8b\n"
+ "smlal v30.4s, v10.4h, v16.4h\n"
+ "smlal2 v24.4s, v10.8h, v16.8h\n"
+ "ldr d10, [x5, #0x78]\n"
"ushll v18.8h, v18.8b, #0x0\n"
- "smlal2 v17.4s, v0.8h, v16.8h\n"
- "ldr d16, [x25, x4]\n"
- "smlal v7.4s, v3.4h, v21.4h\n"
+ "smlal v7.4s, v22.4h, v27.4h\n"
+ "smlal2 v19.4s, v22.8h, v27.8h\n"
+ "ldr d27, [x13, x2]\n"
+ "smlal v12.4s, v8.4h, v16.4h\n"
+ "smlal v5.4s, v22.4h, v16.4h\n"
+ "smlal2 v13.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x5, #0x80]\n"
+ "smlal2 v23.4s, v22.8h, v16.8h\n"
+ "usubl v10.8h, v10.8b, v14.8b\n"
+ "smlal v30.4s, v26.4h, v3.4h\n"
+ "smlal2 v24.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x12, x2]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v7.4s, v0.4h, v16.4h\n"
+ "smlal2 v19.4s, v0.8h, v16.8h\n"
+ "ldr d16, [x11, x2]\n"
+ "smlal v12.4s, v9.4h, v3.4h\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v13.4s, v9.8h, v3.8h\n"
+ "smlal2 v23.4s, v17.8h, v3.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v30.4s, v9.4h, v6.4h\n"
+ "smlal2 v24.4s, v9.8h, v6.8h\n"
+ "ldr d9, [x5, #0x88]\n"
"ushll v16.8h, v16.8b, #0x0\n"
- "smlal v27.4s, v9.4h, v21.4h\n"
- "smlal v8.4s, v10.4h, v21.4h\n"
- "smlal2 v24.4s, v3.8h, v11.8h\n"
- "smlal v13.4s, v5.4h, v15.4h\n"
- "smlal2 v14.4s, v3.8h, v21.8h\n"
- "ldr d3, [x7, #0xb0]\n"
- "smlal2 v22.4s, v9.8h, v21.8h\n"
- "usubl v3.8h, v3.8b, v2.8b\n"
- "smlal2 v17.4s, v10.8h, v21.8h\n"
- "ldr d21, [x24, x4]\n"
- "smlal v7.4s, v5.4h, v11.4h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "smlal v27.4s, v10.4h, v11.4h\n"
- "smlal v8.4s, v20.4h, v11.4h\n"
- "smlal2 v24.4s, v5.8h, v15.8h\n"
- "smlal v13.4s, v6.4h, v31.4h\n"
- "smlal2 v14.4s, v5.8h, v11.8h\n"
- "ldr d5, [x7, #0xb8]\n"
- "smlal2 v22.4s, v10.8h, v11.8h\n"
- "usubl v5.8h, v5.8b, v2.8b\n"
- "smlal2 v17.4s, v20.8h, v11.8h\n"
- "ldr d11, [x23, x4]\n"
- "smlal v7.4s, v6.4h, v15.4h\n"
+ "smlal v7.4s, v25.4h, v3.4h\n"
+ "smlal2 v19.4s, v25.8h, v3.8h\n"
+ "ldr d3, [x10, x2]\n"
+ "smlal v12.4s, v21.4h, v6.4h\n"
+ "smlal v5.4s, v25.4h, v6.4h\n"
+ "smlal2 v13.4s, v21.8h, v6.8h\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "usubl v9.8h, v9.8b, v14.8b\n"
+ "smlal v30.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v21.8h, v4.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "smlal v7.4s, v29.4h, v6.4h\n"
+ "smlal2 v19.4s, v29.8h, v6.8h\n"
+ "ldr d6, [x9, x2]\n"
+ "smlal v12.4s, v1.4h, v4.4h\n"
+ "smlal v5.4s, v29.4h, v4.4h\n"
+ "smlal2 v13.4s, v1.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v4.8h\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v30.4s, v1.4h, v11.4h\n"
+ "smlal2 v24.4s, v1.8h, v11.8h\n"
+ "ldr d1, [x5, #0x98]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v7.4s, v2.4h, v4.4h\n"
+ "smlal2 v19.4s, v2.8h, v4.8h\n"
+ "ldr d4, [x28, x2]\n"
+ "smlal v12.4s, v22.4h, v11.4h\n"
+ "smlal v5.4s, v2.4h, v11.4h\n"
+ "smlal2 v13.4s, v22.8h, v11.8h\n"
+ "smlal2 v23.4s, v2.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v30.4s, v22.4h, v20.4h\n"
+ "smlal2 v24.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x5, #0xa0]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v7.4s, v18.4h, v11.4h\n"
+ "smlal2 v19.4s, v18.8h, v11.8h\n"
+ "ldr d11, [x27, x2]\n"
+ "smlal v12.4s, v0.4h, v20.4h\n"
+ "smlal v5.4s, v18.4h, v20.4h\n"
+ "smlal2 v13.4s, v0.8h, v20.8h\n"
+ "ldr d0, [x5, #0xa8]\n"
+ "smlal2 v23.4s, v18.8h, v20.8h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v30.4s, v17.4h, v10.4h\n"
+ "smlal2 v24.4s, v17.8h, v10.8h\n"
+ "ldr d17, [x26, x2]\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "smlal v27.4s, v20.4h, v15.4h\n"
- "smlal v8.4s, v23.4h, v15.4h\n"
- "smlal2 v24.4s, v6.8h, v31.8h\n"
- "smlal v13.4s, v19.4h, v29.4h\n"
- "smlal2 v14.4s, v6.8h, v15.8h\n"
- "ldr d6, [x7, #0xc0]\n"
- "smlal2 v22.4s, v20.8h, v15.8h\n"
- "usubl v6.8h, v6.8b, v2.8b\n"
- "smlal2 v17.4s, v23.8h, v15.8h\n"
- "ldr d15, [x22, x4]\n"
- "smlal v7.4s, v19.4h, v31.4h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal v27.4s, v23.4h, v31.4h\n"
- "smlal v8.4s, v30.4h, v31.4h\n"
- "add x7, x7, #0xc8\n"
- "smlal2 v24.4s, v19.8h, v29.8h\n"
- "smlal v13.4s, v9.4h, v1.4h\n"
- "smlal2 v14.4s, v19.8h, v31.8h\n"
- "ldr d19, [x21, x4]\n"
- "smlal2 v22.4s, v23.8h, v31.8h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "smlal2 v17.4s, v30.8h, v31.8h\n"
- "ldr q31, [x8, #0x0]\n"
- "smlal v7.4s, v0.4h, v29.4h\n"
- "add x4, x4, #0x8\n"
- "smlal v27.4s, v30.4h, v29.4h\n"
- "smlal v8.4s, v28.4h, v29.4h\n"
- "smlal2 v24.4s, v9.8h, v1.8h\n"
- "ldr q9, [x17, #0x0]\n"
- "smlal v13.4s, v10.4h, v4.4h\n"
- "smlal2 v14.4s, v0.8h, v29.8h\n"
- "ldr q0, [x8, #0x10]\n"
- "smlal2 v22.4s, v30.8h, v29.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v17.4s, v28.8h, v29.8h\n"
- "ldr q29, [x17, #0x10]\n"
- "smlal v7.4s, v10.4h, v1.4h\n"
- "add x17, x17, #0x20\n"
- "smlal v27.4s, v18.4h, v1.4h\n"
- "smlal v8.4s, v16.4h, v1.4h\n"
- "smlal2 v24.4s, v10.8h, v4.8h\n"
- "smlal v13.4s, v20.4h, v3.4h\n"
- "smlal2 v14.4s, v10.8h, v1.8h\n"
- "smlal2 v22.4s, v18.8h, v1.8h\n"
- "smlal2 v17.4s, v16.8h, v1.8h\n"
- "smlal v7.4s, v20.4h, v4.4h\n"
- "smlal v27.4s, v16.4h, v4.4h\n"
- "smlal v8.4s, v21.4h, v4.4h\n"
- "smlal2 v24.4s, v20.8h, v3.8h\n"
- "smlal v13.4s, v23.4h, v5.4h\n"
- "smlal2 v14.4s, v20.8h, v4.8h\n"
- "smlal2 v22.4s, v16.8h, v4.8h\n"
- "smlal2 v17.4s, v21.8h, v4.8h\n"
- "smlal v7.4s, v23.4h, v3.4h\n"
- "smlal v27.4s, v21.4h, v3.4h\n"
- "smlal v8.4s, v11.4h, v3.4h\n"
- "smlal2 v24.4s, v23.8h, v5.8h\n"
- "smlal v13.4s, v30.4h, v6.4h\n"
- "sqrdmulh v13.4s, v13.4s, v31.4s\n"
- "smlal2 v14.4s, v23.8h, v3.8h\n"
- "smlal2 v22.4s, v21.8h, v3.8h\n"
- "and v23.16b, v13.16b, v9.16b\n"
- "smlal2 v17.4s, v11.8h, v3.8h\n"
- "smlal v7.4s, v30.4h, v5.4h\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "smlal v27.4s, v11.4h, v5.4h\n"
- "smlal v8.4s, v15.4h, v5.4h\n"
- "sqadd v13.4s, v13.4s, v23.4s\n"
- "smlal2 v24.4s, v30.8h, v6.8h\n"
- "smlal2 v14.4s, v30.8h, v5.8h\n"
- "sqrdmulh v24.4s, v24.4s, v0.4s\n"
- "smlal2 v22.4s, v11.8h, v5.8h\n"
- "smlal2 v17.4s, v15.8h, v5.8h\n"
- "and v10.16b, v24.16b, v29.16b\n"
- "smlal v7.4s, v28.4h, v6.4h\n"
- "smlal v27.4s, v15.4h, v6.4h\n"
- "sqrdmulh v7.4s, v7.4s, v31.4s\n"
- "smlal v8.4s, v19.4h, v6.4h\n"
- "smlal2 v14.4s, v28.8h, v6.8h\n"
- "sqrdmulh v27.4s, v27.4s, v31.4s\n"
- "smlal2 v22.4s, v15.8h, v6.8h\n"
- "smlal2 v17.4s, v19.8h, v6.8h\n"
- "sqrdmulh v8.4s, v8.4s, v31.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "and v28.16b, v7.16b, v9.16b\n"
- "sqrdmulh v14.4s, v14.4s, v0.4s\n"
- "and v20.16b, v27.16b, v9.16b\n"
- "sqrdmulh v22.4s, v22.4s, v0.4s\n"
- "and v23.16b, v8.16b, v9.16b\n"
- "sqrdmulh v17.4s, v17.4s, v0.4s\n"
- "sqadd v24.4s, v24.4s, v10.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v18.16b, v14.16b, v29.16b\n"
+ "smlal v7.4s, v27.4h, v20.4h\n"
+ "smlal2 v19.4s, v27.8h, v20.8h\n"
+ "ldr d20, [x25, x2]\n"
+ "smlal v12.4s, v25.4h, v10.4h\n"
+ "smlal v5.4s, v26.4h, v10.4h\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal2 v13.4s, v25.8h, v10.8h\n"
+ "smlal2 v23.4s, v26.8h, v10.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v30.4s, v25.4h, v8.4h\n"
+ "smlal2 v24.4s, v25.8h, v8.8h\n"
+ "ldr d25, [x5, #0xb0]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v7.4s, v16.4h, v10.4h\n"
+ "smlal2 v19.4s, v16.8h, v10.8h\n"
+ "ldr d10, [x24, x2]\n"
+ "smlal v12.4s, v29.4h, v8.4h\n"
+ "smlal v5.4s, v16.4h, v8.4h\n"
+ "smlal2 v13.4s, v29.8h, v8.8h\n"
+ "smlal2 v23.4s, v16.8h, v8.8h\n"
+ "usubl v25.8h, v25.8b, v14.8b\n"
+ "smlal v30.4s, v29.4h, v9.4h\n"
+ "smlal2 v24.4s, v29.8h, v9.8h\n"
+ "ldr d29, [x5, #0xb8]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v19.4s, v3.8h, v8.8h\n"
+ "ldr d8, [x23, x2]\n"
+ "smlal v12.4s, v2.4h, v9.4h\n"
+ "smlal v5.4s, v3.4h, v9.4h\n"
+ "smlal2 v13.4s, v2.8h, v9.8h\n"
+ "smlal2 v23.4s, v3.8h, v9.8h\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v30.4s, v2.4h, v21.4h\n"
+ "smlal2 v24.4s, v2.8h, v21.8h\n"
+ "ldr d2, [x5, #0xc0]\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v7.4s, v6.4h, v9.4h\n"
+ "smlal2 v19.4s, v6.8h, v9.8h\n"
+ "ldr d9, [x22, x2]\n"
+ "add x5, x5, #0xc8\n"
+ "smlal v12.4s, v18.4h, v21.4h\n"
+ "smlal v5.4s, v6.4h, v21.4h\n"
+ "smlal2 v13.4s, v18.8h, v21.8h\n"
+ "smlal2 v23.4s, v6.8h, v21.8h\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v30.4s, v18.4h, v1.4h\n"
+ "smlal2 v24.4s, v18.8h, v1.8h\n"
+ "ldr d18, [x21, x2]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v19.4s, v4.8h, v21.8h\n"
+ "ldr q21, [x6, #0x0]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v12.4s, v27.4h, v1.4h\n"
+ "smlal v5.4s, v4.4h, v1.4h\n"
+ "smlal2 v13.4s, v27.8h, v1.8h\n"
+ "ldr q27, [x7, #0x0]\n"
+ "smlal2 v23.4s, v4.8h, v1.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v30.4s, v26.4h, v22.4h\n"
+ "smlal2 v24.4s, v26.8h, v22.8h\n"
+ "ldr q26, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v7.4s, v11.4h, v1.4h\n"
+ "smlal2 v19.4s, v11.8h, v1.8h\n"
+ "ldr q1, [x7, #0x10]\n"
+ "add x7, x7, #0x20\n"
+ "smlal v12.4s, v16.4h, v22.4h\n"
+ "smlal v5.4s, v17.4h, v22.4h\n"
+ "smlal2 v13.4s, v16.8h, v22.8h\n"
+ "smlal2 v23.4s, v17.8h, v22.8h\n"
+ "smlal v30.4s, v16.4h, v0.4h\n"
+ "smlal2 v24.4s, v16.8h, v0.8h\n"
+ "smlal v7.4s, v20.4h, v22.4h\n"
+ "smlal2 v19.4s, v20.8h, v22.8h\n"
+ "smlal v12.4s, v3.4h, v0.4h\n"
+ "smlal v5.4s, v20.4h, v0.4h\n"
+ "smlal2 v13.4s, v3.8h, v0.8h\n"
+ "smlal2 v23.4s, v20.8h, v0.8h\n"
+ "smlal v30.4s, v3.4h, v25.4h\n"
+ "smlal2 v24.4s, v3.8h, v25.8h\n"
+ "smlal v7.4s, v10.4h, v0.4h\n"
+ "smlal2 v19.4s, v10.8h, v0.8h\n"
+ "smlal v12.4s, v6.4h, v25.4h\n"
+ "smlal v5.4s, v10.4h, v25.4h\n"
+ "smlal2 v13.4s, v6.8h, v25.8h\n"
+ "smlal2 v23.4s, v10.8h, v25.8h\n"
+ "smlal v30.4s, v6.4h, v29.4h\n"
+ "smlal2 v24.4s, v6.8h, v29.8h\n"
+ "smlal v7.4s, v8.4h, v25.4h\n"
+ "smlal2 v19.4s, v8.8h, v25.8h\n"
+ "smlal v12.4s, v4.4h, v29.4h\n"
+ "smlal v5.4s, v8.4h, v29.4h\n"
+ "smlal2 v13.4s, v4.8h, v29.8h\n"
+ "smlal2 v23.4s, v8.8h, v29.8h\n"
+ "smlal v30.4s, v4.4h, v2.4h\n"
+ "smlal2 v24.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v9.4h, v29.4h\n"
+ "smlal2 v19.4s, v9.8h, v29.8h\n"
+ "smlal v12.4s, v11.4h, v2.4h\n"
+ "smlal v5.4s, v9.4h, v2.4h\n"
+ "smlal2 v13.4s, v11.8h, v2.8h\n"
+ "smlal2 v23.4s, v9.8h, v2.8h\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "smlal v7.4s, v18.4h, v2.4h\n"
+ "smlal2 v19.4s, v18.8h, v2.8h\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "sqrdmulh v12.4s, v12.4s, v21.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v21.4s\n"
+ "and v16.16b, v24.16b, v1.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v7.4s, v7.4s, v21.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v20.16b, v12.16b, v27.16b\n"
+ "and v0.16b, v5.16b, v27.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "and v8.16b, v7.16b, v27.16b\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
"sshr v20.4s, v20.4s, #0x1f\n"
- "and v30.16b, v22.16b, v29.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v5.16b, v17.16b, v29.16b\n"
- "sqadd v7.4s, v7.4s, v28.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v20.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v23.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v9.4s\n"
- "srshl v7.4s, v7.4s, v9.4s\n"
- "sqadd v14.4s, v14.4s, v18.4s\n"
- "srshl v27.4s, v27.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v30.4s\n"
- "srshl v8.4s, v8.4s, v9.4s\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "srshl v24.4s, v24.4s, v29.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v14.4s, v14.4s, v29.4s\n"
+ "and v22.16b, v13.16b, v1.16b\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "and v17.16b, v23.16b, v1.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v16.16b, v19.16b, v1.16b\n"
+ "sqadd v12.4s, v12.4s, v20.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v5.4s, v5.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v7.4s, v7.4s, v8.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "srshl v12.4s, v12.4s, v27.4s\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v27.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "sqxtn v30.4h, v30.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "sqxtn v12.4h, v12.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v19.4s, v19.4s, v1.4s\n"
"sqxtn v7.4h, v7.4s\n"
- "srshl v22.4s, v22.4s, v29.4s\n"
- "sqxtn v27.4h, v27.4s\n"
- "srshl v17.4s, v17.4s, v29.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "sqxtn2 v13.8h, v24.4s\n"
- "sqxtn2 v7.8h, v14.4s\n"
- "sqxtn2 v27.8h, v22.4s\n"
- "sqxtn2 v8.8h, v17.4s\n"
- "sqadd v13.8h, v13.8h, v25.8h\n"
- "sqadd v7.8h, v7.8h, v25.8h\n"
- "sqadd v27.8h, v27.8h, v25.8h\n"
- "sqadd v8.8h, v8.8h, v25.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v7.8h, v7.8h, v12.8h\n"
- "smax v27.8h, v27.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v26.8h\n"
- "smin v7.8h, v7.8h, v26.8h\n"
- "smin v27.8h, v27.8h, v26.8h\n"
- "smin v8.8h, v8.8h, v26.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x16, x5]\n"
+ "sqxtn2 v30.8h, v24.4s\n"
+ "sqxtn2 v12.8h, v13.4s\n"
+ "sqxtn2 v5.8h, v23.4s\n"
+ "sqxtn2 v7.8h, v19.4s\n"
+ "sqadd v30.8h, v30.8h, v15.8h\n"
+ "sqadd v12.8h, v12.8h, v15.8h\n"
+ "sqadd v5.8h, v5.8h, v15.8h\n"
+ "sqadd v7.8h, v7.8h, v15.8h\n"
+ "smax v30.8h, v30.8h, v31.8h\n"
+ "smax v12.8h, v12.8h, v31.8h\n"
+ "smax v5.8h, v5.8h, v31.8h\n"
+ "smax v7.8h, v7.8h, v31.8h\n"
+ "smin v30.8h, v30.8h, v28.8h\n"
+ "smin v12.8h, v12.8h, v28.8h\n"
+ "smin v5.8h, v5.8h, v28.8h\n"
+ "smin v7.8h, v7.8h, v28.8h\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v7.16b, v7.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str d7, [x15, x5]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "str d27, [x14, x5]\n"
- "str d8, [x13, x5]\n"
- "ldr q13, [x20, #0x0]\n"
+ "str d30, [x17, x3]\n"
+ "str d12, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "str d7, [x14, x3]\n"
+ "add x3, x3, #0x8\n"
+ "ldr q30, [x20, #0x0]\n"
"ldr q24, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d21, [x7, #0x0]\n"
- "ldr d15, [x7, #0x8]\n"
- "add x5, x5, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d29, [x7, #0x10]\n"
- "ldr d18, [x7, #0x18]\n"
- "mov v7.16b, v13.16b\n"
- "mov v14.16b, v24.16b\n"
- "ldr d3, [x7, #0x20]\n"
- "ldp x9, x28, [x6, #0x0]\n"
- "mov v27.16b, v13.16b\n"
- "mov v22.16b, v24.16b\n"
- "ldp x27, x26, [x6, #0x10]\n"
- "ldp x25, x24, [x6, #0x20]\n"
- "mov v8.16b, v13.16b\n"
- "mov v17.16b, v24.16b\n"
- "ldp x23, x22, [x6, #0x30]\n"
- "ldp x21, x20, [x6, #0x40]\n"
- "usubl v21.8h, v21.8b, v2.8b\n"
- "usubl v15.8h, v15.8b, v2.8b\n"
- "ldr d10, [x9, x4]\n"
- "ldr d16, [x28, x4]\n"
- "usubl v29.8h, v29.8b, v2.8b\n"
- "usubl v18.8h, v18.8b, v2.8b\n"
- "ldr d23, [x27, x4]\n"
- "ldr d30, [x26, x4]\n"
- "usubl v3.8h, v3.8b, v2.8b\n"
- "ushll v10.8h, v10.8b, #0x0\n"
- "ldr d4, [x25, x4]\n"
- "ldr d28, [x24, x4]\n"
- "ushll v16.8h, v16.8b, #0x0\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "ldr d31, [x23, x4]\n"
- "ldr d1, [x22, x4]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d6, [x5, #0x0]\n"
+ "ldr d20, [x5, #0x8]\n"
+ "ldr d9, [x5, #0x10]\n"
+ "ldr d1, [x5, #0x18]\n"
+ "ldr d17, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v12.16b, v30.16b\n"
+ "mov v13.16b, v24.16b\n"
+ "mov v5.16b, v30.16b\n"
+ "mov v23.16b, v24.16b\n"
+ "mov v7.16b, v30.16b\n"
+ "mov v19.16b, v24.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "usubl v6.8h, v6.8b, v14.8b\n"
+ "usubl v20.8h, v20.8b, v14.8b\n"
+ "usubl v9.8h, v9.8b, v14.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d18, [x9, x2]\n"
+ "ldr d4, [x28, x2]\n"
+ "ldr d0, [x27, x2]\n"
+ "ldr d25, [x26, x2]\n"
+ "ldr d10, [x25, x2]\n"
+ "ldr d11, [x24, x2]\n"
+ "ldr d22, [x23, x2]\n"
+ "ldr d21, [x22, x2]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v4.8h, v4.8b, #0x0\n"
- "ldr d9, [x21, x4]\n"
- "ldr d11, [x20, x4]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "ushll v1.8h, v1.8b, #0x0\n"
- "ushll v9.8h, v9.8b, #0x0\n"
+ "ldr d8, [x21, x2]\n"
+ "ldr d26, [x20, x2]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
"ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
"bgt 1b\n"
"2:" // Tail
- "ldr d0, [x7, #0x28]\n"
- "ldr d20, [x7, #0x30]\n"
- "smlal v13.4s, v10.4h, v21.4h\n"
- "smlal2 v24.4s, v10.8h, v21.8h\n"
- "ldr d6, [x7, #0x38]\n"
- "ldr d19, [x7, #0x40]\n"
- "smlal v13.4s, v16.4h, v15.4h\n"
- "smlal v7.4s, v16.4h, v21.4h\n"
- "ldr d10, [x7, #0x48]\n"
- "ldr d5, [x7, #0x50]\n"
- "smlal v27.4s, v23.4h, v21.4h\n"
- "smlal v8.4s, v30.4h, v21.4h\n"
- "ldr x21, [x6, #0x50]\n"
- "smlal2 v24.4s, v16.8h, v15.8h\n"
- "smlal v13.4s, v4.4h, v29.4h\n"
- "ldr x20, [x6, #0x58]\n"
- "smlal2 v14.4s, v16.8h, v21.8h\n"
- "ldr d16, [x21, x4]\n"
- "smlal2 v22.4s, v23.8h, v21.8h\n"
- "ushll v16.8h, v16.8b, #0x0\n"
- "smlal2 v17.4s, v30.8h, v21.8h\n"
- "ldr d21, [x20, x4]\n"
- "smlal v7.4s, v4.4h, v15.4h\n"
- "ldr x22, [x6, #0x60]\n"
- "smlal v27.4s, v30.4h, v15.4h\n"
- "smlal v8.4s, v28.4h, v15.4h\n"
+ "ldr d3, [x5, #0x28]\n"
+ "ldr d27, [x5, #0x30]\n"
+ "smlal v30.4s, v18.4h, v6.4h\n"
+ "smlal2 v24.4s, v18.8h, v6.8h\n"
+ "ldr d16, [x5, #0x38]\n"
+ "ldr d18, [x5, #0x40]\n"
+ "smlal v12.4s, v4.4h, v6.4h\n"
+ "smlal v5.4s, v0.4h, v6.4h\n"
+ "ldr d2, [x5, #0x48]\n"
+ "ldr d29, [x5, #0x50]\n"
+ "smlal v7.4s, v25.4h, v6.4h\n"
+ "smlal2 v13.4s, v4.8h, v6.8h\n"
+ "ldr x23, [x4, #0x50]\n"
+ "smlal2 v23.4s, v0.8h, v6.8h\n"
+ "smlal2 v19.4s, v25.8h, v6.8h\n"
+ "ldr d6, [x5, #0x58]\n"
+ "smlal v30.4s, v4.4h, v20.4h\n"
+ "smlal2 v24.4s, v4.8h, v20.8h\n"
+ "ldr d4, [x5, #0x60]\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v12.4s, v10.4h, v20.4h\n"
+ "smlal v5.4s, v25.4h, v20.4h\n"
+ "ldr x22, [x4, #0x60]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v7.4s, v11.4h, v20.4h\n"
+ "smlal2 v13.4s, v10.8h, v20.8h\n"
+ "ldr x21, [x4, #0x68]\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal2 v23.4s, v25.8h, v20.8h\n"
+ "smlal2 v19.4s, v11.8h, v20.8h\n"
+ "ldr d20, [x23, x2]\n"
+ "ldr x27, [x4, #0x70]\n"
+ "smlal v30.4s, v10.4h, v9.4h\n"
+ "smlal2 v24.4s, v10.8h, v9.8h\n"
+ "ldr d10, [x20, x2]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v12.4s, v22.4h, v9.4h\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x4, #0x78]\n"
+ "usubl v18.8h, v18.8b, v14.8b\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal2 v13.4s, v22.8h, v9.8h\n"
+ "ldr x26, [x4, #0x80]\n"
+ "ldr x25, [x4, #0x88]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v23.4s, v11.8h, v9.8h\n"
+ "ldr x24, [x4, #0x90]\n"
+ "ldr x23, [x4, #0x98]\n"
+ "smlal v30.4s, v22.4h, v1.4h\n"
+ "smlal2 v24.4s, v22.8h, v1.8h\n"
+ "ldr d22, [x22, x2]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v7.4s, v20.4h, v9.4h\n"
+ "smlal2 v19.4s, v20.8h, v9.8h\n"
+ "ldr d9, [x21, x2]\n"
+ "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal v12.4s, v21.4h, v1.4h\n"
+ "smlal v5.4s, v20.4h, v1.4h\n"
+ "usubl v6.8h, v6.8b, v14.8b\n"
+ "ldr x22, [x4, #0xa0]\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "smlal2 v13.4s, v21.8h, v1.8h\n"
+ "smlal2 v23.4s, v20.8h, v1.8h\n"
+ "ldr x21, [x4, #0xa8]\n"
+ "smlal v30.4s, v21.4h, v17.4h\n"
+ "smlal2 v24.4s, v21.8h, v17.8h\n"
+ "ldr d21, [x27, x2]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x20, x2]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v12.4s, v22.4h, v17.4h\n"
+ "smlal v5.4s, v10.4h, v17.4h\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "ldr x12, [x4, #0xb8]\n"
+ "smlal2 v13.4s, v22.8h, v17.8h\n"
+ "ldr d22, [x26, x2]\n"
+ "smlal2 v23.4s, v10.8h, v17.8h\n"
"ushll v21.8h, v21.8b, #0x0\n"
- "ldr x20, [x6, #0x68]\n"
- "smlal2 v24.4s, v4.8h, v29.8h\n"
- "smlal v13.4s, v31.4h, v18.4h\n"
- "usubl v0.8h, v0.8b, v2.8b\n"
- "ldr x21, [x6, #0x70]\n"
- "smlal2 v14.4s, v4.8h, v15.8h\n"
- "ldr d4, [x22, x4]\n"
- "smlal2 v22.4s, v30.8h, v15.8h\n"
- "ushll v4.8h, v4.8b, #0x0\n"
- "smlal2 v17.4s, v28.8h, v15.8h\n"
- "ldr d15, [x20, x4]\n"
- "smlal v7.4s, v31.4h, v29.4h\n"
- "usubl v20.8h, v20.8b, v2.8b\n"
- "smlal v27.4s, v28.4h, v29.4h\n"
- "smlal v8.4s, v16.4h, v29.4h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "ldr x20, [x6, #0x78]\n"
- "smlal2 v24.4s, v31.8h, v18.8h\n"
- "smlal v13.4s, v1.4h, v3.4h\n"
- "usubl v6.8h, v6.8b, v2.8b\n"
- "ldr x22, [x6, #0x80]\n"
- "smlal2 v14.4s, v31.8h, v29.8h\n"
- "ldr d31, [x21, x4]\n"
- "smlal2 v22.4s, v28.8h, v29.8h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v17.4s, v16.8h, v29.8h\n"
- "ldr d29, [x20, x4]\n"
- "smlal v7.4s, v1.4h, v18.4h\n"
- "usubl v19.8h, v19.8b, v2.8b\n"
- "smlal v27.4s, v16.4h, v18.4h\n"
- "smlal v8.4s, v21.4h, v18.4h\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "ldr x20, [x6, #0x88]\n"
- "smlal2 v24.4s, v1.8h, v3.8h\n"
- "smlal v13.4s, v23.4h, v0.4h\n"
- "usubl v10.8h, v10.8b, v2.8b\n"
- "ldr x21, [x6, #0x90]\n"
- "smlal2 v14.4s, v1.8h, v18.8h\n"
- "ldr d1, [x22, x4]\n"
- "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "smlal v30.4s, v0.4h, v3.4h\n"
+ "smlal2 v24.4s, v0.8h, v3.8h\n"
+ "ldr d0, [x25, x2]\n"
"ushll v1.8h, v1.8b, #0x0\n"
- "smlal2 v17.4s, v21.8h, v18.8h\n"
- "ldr d18, [x20, x4]\n"
- "smlal v7.4s, v4.4h, v3.4h\n"
- "usubl v5.8h, v5.8b, v2.8b\n"
- "smlal v27.4s, v21.4h, v3.4h\n"
- "smlal v8.4s, v9.4h, v3.4h\n"
- "ldr x20, [x6, #0x98]\n"
+ "smlal v7.4s, v8.4h, v17.4h\n"
+ "smlal2 v19.4s, v8.8h, v17.8h\n"
+ "ldr d17, [x24, x2]\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "smlal v12.4s, v25.4h, v3.4h\n"
+ "smlal v5.4s, v26.4h, v3.4h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "ldr x10, [x4, #0xc8]\n"
+ "smlal2 v13.4s, v25.8h, v3.8h\n"
+ "smlal2 v23.4s, v26.8h, v3.8h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v30.4s, v25.4h, v27.4h\n"
+ "smlal2 v24.4s, v25.8h, v27.8h\n"
+ "ldr d25, [x23, x2]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v7.4s, v9.4h, v3.4h\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x22, x2]\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal v12.4s, v11.4h, v27.4h\n"
+ "smlal v5.4s, v9.4h, v27.4h\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal2 v13.4s, v11.8h, v27.8h\n"
+ "smlal2 v23.4s, v9.8h, v27.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "smlal v30.4s, v11.4h, v16.4h\n"
+ "smlal2 v24.4s, v11.8h, v16.8h\n"
+ "ldr d11, [x5, #0x68]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "smlal v7.4s, v21.4h, v27.4h\n"
+ "smlal2 v19.4s, v21.8h, v27.8h\n"
+ "ldr d27, [x21, x2]\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal v12.4s, v20.4h, v16.4h\n"
+ "smlal v5.4s, v21.4h, v16.4h\n"
+ "ldr x23, [x4, #0x100]\n"
+ "ldr x22, [x4, #0x108]\n"
+ "smlal2 v13.4s, v20.8h, v16.8h\n"
+ "smlal2 v23.4s, v21.8h, v16.8h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "ldr x21, [x4, #0x110]\n"
+ "smlal v30.4s, v20.4h, v18.4h\n"
+ "smlal2 v24.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x5, #0x70]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v7.4s, v1.4h, v16.4h\n"
+ "smlal2 v19.4s, v1.8h, v16.8h\n"
+ "ldr d16, [x20, x2]\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal v12.4s, v10.4h, v18.4h\n"
+ "smlal v5.4s, v1.4h, v18.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v13.4s, v10.8h, v18.8h\n"
+ "smlal2 v23.4s, v1.8h, v18.8h\n"
+ "usubl v20.8h, v20.8b, v14.8b\n"
+ "smlal v30.4s, v10.4h, v2.4h\n"
+ "smlal2 v24.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x5, #0x78]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v7.4s, v22.4h, v18.4h\n"
+ "smlal2 v19.4s, v22.8h, v18.8h\n"
+ "ldr d18, [x12, x2]\n"
+ "smlal v12.4s, v8.4h, v2.4h\n"
+ "smlal v5.4s, v22.4h, v2.4h\n"
+ "smlal2 v13.4s, v8.8h, v2.8h\n"
+ "ldr d8, [x5, #0x80]\n"
+ "smlal2 v23.4s, v22.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v14.8b\n"
+ "smlal v30.4s, v26.4h, v29.4h\n"
+ "smlal2 v24.4s, v26.8h, v29.8h\n"
+ "ldr d26, [x11, x2]\n"
"ushll v18.8h, v18.8b, #0x0\n"
- "smlal2 v24.4s, v23.8h, v0.8h\n"
- "ldr d23, [x7, #0x58]\n"
- "smlal v13.4s, v30.4h, v20.4h\n"
- "usubl v23.8h, v23.8b, v2.8b\n"
- "smlal2 v14.4s, v4.8h, v3.8h\n"
- "ldr d4, [x21, x4]\n"
- "smlal2 v22.4s, v21.8h, v3.8h\n"
- "ldr x22, [x6, #0xa0]\n"
- "smlal2 v17.4s, v9.8h, v3.8h\n"
- "ldr d3, [x20, x4]\n"
- "smlal v7.4s, v30.4h, v0.4h\n"
+ "smlal v7.4s, v0.4h, v2.4h\n"
+ "smlal2 v19.4s, v0.8h, v2.8h\n"
+ "ldr d2, [x10, x2]\n"
+ "smlal v12.4s, v9.4h, v29.4h\n"
+ "smlal v5.4s, v17.4h, v29.4h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v13.4s, v9.8h, v29.8h\n"
+ "smlal2 v23.4s, v17.8h, v29.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v30.4s, v9.4h, v6.4h\n"
+ "smlal2 v24.4s, v9.8h, v6.8h\n"
+ "ldr d9, [x5, #0x88]\n"
+ "ushll v2.8h, v2.8b, #0x0\n"
+ "smlal v7.4s, v25.4h, v29.4h\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x9, x2]\n"
+ "smlal v12.4s, v21.4h, v6.4h\n"
+ "smlal v5.4s, v25.4h, v6.4h\n"
+ "smlal2 v13.4s, v21.8h, v6.8h\n"
+ "smlal2 v23.4s, v25.8h, v6.8h\n"
+ "usubl v9.8h, v9.8b, v14.8b\n"
+ "smlal v30.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v21.8h, v4.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v7.4s, v3.4h, v6.4h\n"
+ "smlal2 v19.4s, v3.8h, v6.8h\n"
+ "ldr d6, [x28, x2]\n"
+ "smlal v12.4s, v1.4h, v4.4h\n"
+ "smlal v5.4s, v3.4h, v4.4h\n"
+ "smlal2 v13.4s, v1.8h, v4.8h\n"
+ "smlal2 v23.4s, v3.8h, v4.8h\n"
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v30.4s, v1.4h, v11.4h\n"
+ "smlal2 v24.4s, v1.8h, v11.8h\n"
+ "ldr d1, [x5, #0x98]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v7.4s, v27.4h, v4.4h\n"
+ "smlal2 v19.4s, v27.8h, v4.8h\n"
+ "ldr d4, [x27, x2]\n"
+ "smlal v12.4s, v22.4h, v11.4h\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v13.4s, v22.8h, v11.8h\n"
+ "smlal2 v23.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v30.4s, v22.4h, v20.4h\n"
+ "smlal2 v24.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x5, #0xa0]\n"
"ushll v4.8h, v4.8b, #0x0\n"
- "smlal v27.4s, v11.4h, v0.4h\n"
- "smlal v8.4s, v15.4h, v0.4h\n"
- "ushll v3.8h, v3.8b, #0x0\n"
- "ldr x21, [x6, #0xa8]\n"
- "smlal2 v24.4s, v30.8h, v20.8h\n"
- "smlal v13.4s, v28.4h, v6.4h\n"
- "ldr x20, [x6, #0xb0]\n"
- "ldr x12, [x6, #0xb8]\n"
- "smlal2 v14.4s, v30.8h, v0.8h\n"
- "ldr d30, [x7, #0x60]\n"
- "smlal2 v22.4s, v11.8h, v0.8h\n"
- "usubl v30.8h, v30.8b, v2.8b\n"
- "smlal2 v17.4s, v15.8h, v0.8h\n"
- "ldr d0, [x22, x4]\n"
- "smlal v7.4s, v28.4h, v20.4h\n"
- "ushll v0.8h, v0.8b, #0x0\n"
- "smlal v27.4s, v15.4h, v20.4h\n"
- "smlal v8.4s, v31.4h, v20.4h\n"
- "ldr x11, [x6, #0xc0]\n"
- "ldr x10, [x6, #0xc8]\n"
- "smlal2 v24.4s, v28.8h, v6.8h\n"
- "smlal v13.4s, v16.4h, v19.4h\n"
- "ldr x9, [x6, #0xd0]\n"
- "ldr x28, [x6, #0xd8]\n"
- "smlal2 v14.4s, v28.8h, v20.8h\n"
- "ldr d28, [x7, #0x68]\n"
- "smlal2 v22.4s, v15.8h, v20.8h\n"
- "usubl v28.8h, v28.8b, v2.8b\n"
- "smlal2 v17.4s, v31.8h, v20.8h\n"
- "ldr d20, [x21, x4]\n"
- "smlal v7.4s, v16.4h, v6.4h\n"
+ "smlal v7.4s, v16.4h, v11.4h\n"
+ "smlal2 v19.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x26, x2]\n"
+ "smlal v12.4s, v0.4h, v20.4h\n"
+ "smlal v5.4s, v16.4h, v20.4h\n"
+ "smlal2 v13.4s, v0.8h, v20.8h\n"
+ "ldr d0, [x5, #0xa8]\n"
+ "smlal2 v23.4s, v16.8h, v20.8h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v30.4s, v17.4h, v10.4h\n"
+ "smlal2 v24.4s, v17.8h, v10.8h\n"
+ "ldr d17, [x25, x2]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v7.4s, v18.4h, v20.4h\n"
+ "smlal2 v19.4s, v18.8h, v20.8h\n"
+ "ldr d20, [x24, x2]\n"
+ "smlal v12.4s, v25.4h, v10.4h\n"
+ "smlal v5.4s, v26.4h, v10.4h\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal2 v13.4s, v25.8h, v10.8h\n"
+ "smlal2 v23.4s, v26.8h, v10.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v30.4s, v25.4h, v8.4h\n"
+ "smlal2 v24.4s, v25.8h, v8.8h\n"
+ "ldr d25, [x5, #0xb0]\n"
"ushll v20.8h, v20.8b, #0x0\n"
- "smlal v27.4s, v31.4h, v6.4h\n"
- "smlal v8.4s, v29.4h, v6.4h\n"
- "ldr x27, [x6, #0xe0]\n"
- "ldr x26, [x6, #0xe8]\n"
- "smlal2 v24.4s, v16.8h, v19.8h\n"
- "smlal v13.4s, v21.4h, v10.4h\n"
- "ldr x25, [x6, #0xf0]\n"
- "ldr x24, [x6, #0xf8]\n"
- "smlal2 v14.4s, v16.8h, v6.8h\n"
- "ldr d16, [x7, #0x70]\n"
- "smlal2 v22.4s, v31.8h, v6.8h\n"
- "usubl v16.8h, v16.8b, v2.8b\n"
- "smlal2 v17.4s, v29.8h, v6.8h\n"
- "ldr d6, [x20, x4]\n"
- "smlal v7.4s, v21.4h, v19.4h\n"
- "ushll v6.8h, v6.8b, #0x0\n"
- "smlal v27.4s, v29.4h, v19.4h\n"
- "smlal v8.4s, v1.4h, v19.4h\n"
- "ldr x23, [x6, #0x100]\n"
- "ldr x22, [x6, #0x108]\n"
- "smlal2 v24.4s, v21.8h, v10.8h\n"
- "smlal v13.4s, v11.4h, v5.4h\n"
- "ldr x21, [x6, #0x110]\n"
- "ldr x20, [x6, #0x118]\n"
- "smlal2 v14.4s, v21.8h, v19.8h\n"
- "ldr d21, [x7, #0x78]\n"
- "smlal2 v22.4s, v29.8h, v19.8h\n"
- "usubl v21.8h, v21.8b, v2.8b\n"
- "smlal2 v17.4s, v1.8h, v19.8h\n"
- "ldr d19, [x12, x4]\n"
- "smlal v7.4s, v9.4h, v10.4h\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "smlal v27.4s, v1.4h, v10.4h\n"
- "smlal v8.4s, v18.4h, v10.4h\n"
- "tst x2, #0x7\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "ldr d11, [x7, #0x80]\n"
- "smlal v13.4s, v15.4h, v23.4h\n"
- "usubl v11.8h, v11.8b, v2.8b\n"
- "smlal2 v14.4s, v9.8h, v10.8h\n"
- "ldr d9, [x11, x4]\n"
- "smlal2 v22.4s, v1.8h, v10.8h\n"
- "ushll v9.8h, v9.8b, #0x0\n"
- "smlal2 v17.4s, v18.8h, v10.8h\n"
- "ldr d10, [x10, x4]\n"
- "smlal v7.4s, v15.4h, v5.4h\n"
+ "smlal v7.4s, v2.4h, v10.4h\n"
+ "smlal2 v19.4s, v2.8h, v10.8h\n"
+ "ldr d10, [x23, x2]\n"
+ "smlal v12.4s, v3.4h, v8.4h\n"
+ "smlal v5.4s, v2.4h, v8.4h\n"
+ "smlal2 v13.4s, v3.8h, v8.8h\n"
+ "smlal2 v23.4s, v2.8h, v8.8h\n"
+ "usubl v25.8h, v25.8b, v14.8b\n"
+ "smlal v30.4s, v3.4h, v9.4h\n"
+ "smlal2 v24.4s, v3.8h, v9.8h\n"
+ "ldr d3, [x5, #0xb8]\n"
"ushll v10.8h, v10.8b, #0x0\n"
- "smlal v27.4s, v4.4h, v5.4h\n"
- "smlal v8.4s, v3.4h, v5.4h\n"
- "smlal2 v24.4s, v15.8h, v23.8h\n"
- "smlal v13.4s, v31.4h, v30.4h\n"
- "smlal2 v14.4s, v15.8h, v5.8h\n"
- "ldr d15, [x7, #0x88]\n"
- "smlal2 v22.4s, v4.8h, v5.8h\n"
- "usubl v15.8h, v15.8b, v2.8b\n"
- "smlal2 v17.4s, v3.8h, v5.8h\n"
- "ldr d5, [x9, x4]\n"
- "smlal v7.4s, v31.4h, v23.4h\n"
- "ushll v5.8h, v5.8b, #0x0\n"
- "smlal v27.4s, v3.4h, v23.4h\n"
- "smlal v8.4s, v0.4h, v23.4h\n"
- "smlal2 v24.4s, v31.8h, v30.8h\n"
- "smlal v13.4s, v29.4h, v28.4h\n"
- "smlal2 v14.4s, v31.8h, v23.8h\n"
- "ldr d31, [x7, #0x90]\n"
- "smlal2 v22.4s, v3.8h, v23.8h\n"
- "usubl v31.8h, v31.8b, v2.8b\n"
- "smlal2 v17.4s, v0.8h, v23.8h\n"
- "ldr d23, [x28, x4]\n"
- "smlal v7.4s, v29.4h, v30.4h\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal v27.4s, v0.4h, v30.4h\n"
- "smlal v8.4s, v20.4h, v30.4h\n"
- "smlal2 v24.4s, v29.8h, v28.8h\n"
- "smlal v13.4s, v1.4h, v16.4h\n"
- "smlal2 v14.4s, v29.8h, v30.8h\n"
- "ldr d29, [x7, #0x98]\n"
- "smlal2 v22.4s, v0.8h, v30.8h\n"
- "usubl v29.8h, v29.8b, v2.8b\n"
- "smlal2 v17.4s, v20.8h, v30.8h\n"
- "ldr d30, [x27, x4]\n"
- "smlal v7.4s, v1.4h, v28.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "smlal v27.4s, v20.4h, v28.4h\n"
- "smlal v8.4s, v6.4h, v28.4h\n"
- "smlal2 v24.4s, v1.8h, v16.8h\n"
- "smlal v13.4s, v4.4h, v21.4h\n"
- "smlal2 v14.4s, v1.8h, v28.8h\n"
- "ldr d1, [x7, #0xa0]\n"
- "smlal2 v22.4s, v20.8h, v28.8h\n"
- "usubl v1.8h, v1.8b, v2.8b\n"
- "smlal2 v17.4s, v6.8h, v28.8h\n"
- "ldr d28, [x26, x4]\n"
- "smlal v7.4s, v18.4h, v16.4h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal v27.4s, v6.4h, v16.4h\n"
- "smlal v8.4s, v19.4h, v16.4h\n"
- "smlal2 v24.4s, v4.8h, v21.8h\n"
- "ldr d4, [x7, #0xa8]\n"
- "smlal v13.4s, v3.4h, v11.4h\n"
- "usubl v4.8h, v4.8b, v2.8b\n"
- "smlal2 v14.4s, v18.8h, v16.8h\n"
- "ldr d18, [x25, x4]\n"
- "smlal2 v22.4s, v6.8h, v16.8h\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "smlal2 v17.4s, v19.8h, v16.8h\n"
- "ldr d16, [x24, x4]\n"
- "smlal v7.4s, v3.4h, v21.4h\n"
+ "smlal v7.4s, v29.4h, v8.4h\n"
+ "smlal2 v19.4s, v29.8h, v8.8h\n"
+ "ldr d8, [x22, x2]\n"
+ "smlal v12.4s, v27.4h, v9.4h\n"
+ "smlal v5.4s, v29.4h, v9.4h\n"
+ "smlal2 v13.4s, v27.8h, v9.8h\n"
+ "smlal2 v23.4s, v29.8h, v9.8h\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v30.4s, v27.4h, v21.4h\n"
+ "smlal2 v24.4s, v27.8h, v21.8h\n"
+ "ldr d27, [x5, #0xc0]\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v7.4s, v6.4h, v9.4h\n"
+ "smlal2 v19.4s, v6.8h, v9.8h\n"
+ "ldr d9, [x21, x2]\n"
+ "smlal v12.4s, v16.4h, v21.4h\n"
+ "smlal v5.4s, v6.4h, v21.4h\n"
+ "smlal2 v13.4s, v16.8h, v21.8h\n"
+ "smlal2 v23.4s, v6.8h, v21.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v30.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v16.8h, v1.8h\n"
+ "ldr d16, [x20, x2]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v19.4s, v4.8h, v21.8h\n"
+ "ldr q21, [x6, #0x0]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v12.4s, v18.4h, v1.4h\n"
+ "smlal v5.4s, v4.4h, v1.4h\n"
+ "smlal2 v13.4s, v18.8h, v1.8h\n"
+ "ldr q18, [x7, #0x0]\n"
+ "smlal2 v23.4s, v4.8h, v1.8h\n"
"ushll v16.8h, v16.8b, #0x0\n"
- "smlal v27.4s, v9.4h, v21.4h\n"
- "smlal v8.4s, v10.4h, v21.4h\n"
- "smlal2 v24.4s, v3.8h, v11.8h\n"
- "smlal v13.4s, v0.4h, v15.4h\n"
- "smlal2 v14.4s, v3.8h, v21.8h\n"
- "ldr d3, [x7, #0xb0]\n"
- "smlal2 v22.4s, v9.8h, v21.8h\n"
- "usubl v3.8h, v3.8b, v2.8b\n"
- "smlal2 v17.4s, v10.8h, v21.8h\n"
- "ldr d21, [x23, x4]\n"
- "smlal v7.4s, v0.4h, v11.4h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "smlal v27.4s, v10.4h, v11.4h\n"
- "smlal v8.4s, v5.4h, v11.4h\n"
- "smlal2 v24.4s, v0.8h, v15.8h\n"
- "smlal v13.4s, v20.4h, v31.4h\n"
- "smlal2 v14.4s, v0.8h, v11.8h\n"
- "ldr d0, [x7, #0xb8]\n"
- "smlal2 v22.4s, v10.8h, v11.8h\n"
- "usubl v0.8h, v0.8b, v2.8b\n"
- "smlal2 v17.4s, v5.8h, v11.8h\n"
- "ldr d11, [x22, x4]\n"
- "smlal v7.4s, v20.4h, v15.4h\n"
- "ushll v11.8h, v11.8b, #0x0\n"
- "smlal v27.4s, v5.4h, v15.4h\n"
- "smlal v8.4s, v23.4h, v15.4h\n"
- "smlal2 v24.4s, v20.8h, v31.8h\n"
- "smlal v13.4s, v6.4h, v29.4h\n"
- "smlal2 v14.4s, v20.8h, v15.8h\n"
- "ldr d20, [x7, #0xc0]\n"
- "smlal2 v22.4s, v5.8h, v15.8h\n"
- "usubl v20.8h, v20.8b, v2.8b\n"
- "smlal2 v17.4s, v23.8h, v15.8h\n"
- "ldr d15, [x21, x4]\n"
- "smlal v7.4s, v6.4h, v31.4h\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal v27.4s, v23.4h, v31.4h\n"
- "smlal v8.4s, v30.4h, v31.4h\n"
- "smlal2 v24.4s, v6.8h, v29.8h\n"
- "smlal v13.4s, v9.4h, v1.4h\n"
- "smlal2 v14.4s, v6.8h, v31.8h\n"
- "ldr d6, [x20, x4]\n"
- "smlal2 v22.4s, v23.8h, v31.8h\n"
- "ushll v6.8h, v6.8b, #0x0\n"
- "smlal2 v17.4s, v30.8h, v31.8h\n"
- "ldr q31, [x8, #0x0]\n"
- "smlal v7.4s, v19.4h, v29.4h\n"
- "add x4, x4, #0x8\n"
- "smlal v27.4s, v30.4h, v29.4h\n"
- "smlal v8.4s, v28.4h, v29.4h\n"
- "smlal2 v24.4s, v9.8h, v1.8h\n"
- "ldr q9, [x17, #0x0]\n"
- "smlal v13.4s, v10.4h, v4.4h\n"
- "smlal2 v14.4s, v19.8h, v29.8h\n"
- "ldr q19, [x8, #0x10]\n"
- "smlal2 v22.4s, v30.8h, v29.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v17.4s, v28.8h, v29.8h\n"
- "ldr q29, [x17, #0x10]\n"
- "smlal v7.4s, v10.4h, v1.4h\n"
- "add x17, x17, #0x20\n"
- "smlal v27.4s, v18.4h, v1.4h\n"
- "smlal v8.4s, v16.4h, v1.4h\n"
- "smlal2 v24.4s, v10.8h, v4.8h\n"
- "smlal v13.4s, v5.4h, v3.4h\n"
- "smlal2 v14.4s, v10.8h, v1.8h\n"
- "smlal2 v22.4s, v18.8h, v1.8h\n"
- "smlal2 v17.4s, v16.8h, v1.8h\n"
- "smlal v7.4s, v5.4h, v4.4h\n"
- "smlal v27.4s, v16.4h, v4.4h\n"
- "smlal v8.4s, v21.4h, v4.4h\n"
- "smlal2 v24.4s, v5.8h, v3.8h\n"
- "smlal v13.4s, v23.4h, v0.4h\n"
- "smlal2 v14.4s, v5.8h, v4.8h\n"
- "smlal2 v22.4s, v16.8h, v4.8h\n"
- "smlal2 v17.4s, v21.8h, v4.8h\n"
- "smlal v7.4s, v23.4h, v3.4h\n"
- "smlal v27.4s, v21.4h, v3.4h\n"
- "smlal v8.4s, v11.4h, v3.4h\n"
- "smlal2 v24.4s, v23.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v20.4h\n"
- "sqrdmulh v13.4s, v13.4s, v31.4s\n"
- "smlal2 v14.4s, v23.8h, v3.8h\n"
- "smlal2 v22.4s, v21.8h, v3.8h\n"
- "and v21.16b, v13.16b, v9.16b\n"
- "smlal2 v17.4s, v11.8h, v3.8h\n"
- "smlal v7.4s, v30.4h, v0.4h\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "smlal v27.4s, v11.4h, v0.4h\n"
- "smlal v8.4s, v15.4h, v0.4h\n"
- "sqadd v13.4s, v13.4s, v21.4s\n"
- "smlal2 v24.4s, v30.8h, v20.8h\n"
- "smlal2 v14.4s, v30.8h, v0.8h\n"
- "sqrdmulh v24.4s, v24.4s, v19.4s\n"
- "smlal2 v22.4s, v11.8h, v0.8h\n"
- "smlal2 v17.4s, v15.8h, v0.8h\n"
- "and v16.16b, v24.16b, v29.16b\n"
- "smlal v7.4s, v28.4h, v20.4h\n"
- "smlal v27.4s, v15.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v31.4s\n"
- "smlal v8.4s, v6.4h, v20.4h\n"
- "smlal2 v14.4s, v28.8h, v20.8h\n"
- "sqrdmulh v27.4s, v27.4s, v31.4s\n"
- "smlal2 v22.4s, v15.8h, v20.8h\n"
- "smlal2 v17.4s, v6.8h, v20.8h\n"
- "sqrdmulh v8.4s, v8.4s, v31.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v23.16b, v7.16b, v9.16b\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "and v20.16b, v27.16b, v9.16b\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "and v3.16b, v8.16b, v9.16b\n"
- "sqrdmulh v17.4s, v17.4s, v19.4s\n"
- "sqadd v24.4s, v24.4s, v16.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v18.16b, v14.16b, v29.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v19.16b, v22.16b, v29.16b\n"
+ "smlal v30.4s, v26.4h, v22.4h\n"
+ "smlal2 v24.4s, v26.8h, v22.8h\n"
+ "ldr q26, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v7.4s, v11.4h, v1.4h\n"
+ "smlal2 v19.4s, v11.8h, v1.8h\n"
+ "ldr q1, [x7, #0x10]\n"
+ "add x7, x7, #0x20\n"
+ "smlal v12.4s, v2.4h, v22.4h\n"
+ "smlal v5.4s, v17.4h, v22.4h\n"
+ "smlal2 v13.4s, v2.8h, v22.8h\n"
+ "smlal2 v23.4s, v17.8h, v22.8h\n"
+ "smlal v30.4s, v2.4h, v0.4h\n"
+ "smlal2 v24.4s, v2.8h, v0.8h\n"
+ "smlal v7.4s, v20.4h, v22.4h\n"
+ "smlal2 v19.4s, v20.8h, v22.8h\n"
+ "smlal v12.4s, v29.4h, v0.4h\n"
+ "smlal v5.4s, v20.4h, v0.4h\n"
+ "smlal2 v13.4s, v29.8h, v0.8h\n"
+ "smlal2 v23.4s, v20.8h, v0.8h\n"
+ "smlal v30.4s, v29.4h, v25.4h\n"
+ "smlal2 v24.4s, v29.8h, v25.8h\n"
+ "smlal v7.4s, v10.4h, v0.4h\n"
+ "smlal2 v19.4s, v10.8h, v0.8h\n"
+ "smlal v12.4s, v6.4h, v25.4h\n"
+ "smlal v5.4s, v10.4h, v25.4h\n"
+ "smlal2 v13.4s, v6.8h, v25.8h\n"
+ "smlal2 v23.4s, v10.8h, v25.8h\n"
+ "smlal v30.4s, v6.4h, v3.4h\n"
+ "smlal2 v24.4s, v6.8h, v3.8h\n"
+ "smlal v7.4s, v8.4h, v25.4h\n"
+ "smlal2 v19.4s, v8.8h, v25.8h\n"
+ "smlal v12.4s, v4.4h, v3.4h\n"
+ "smlal v5.4s, v8.4h, v3.4h\n"
+ "smlal2 v13.4s, v4.8h, v3.8h\n"
+ "smlal2 v23.4s, v8.8h, v3.8h\n"
+ "smlal v30.4s, v4.4h, v27.4h\n"
+ "smlal2 v24.4s, v4.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v3.4h\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "smlal v12.4s, v11.4h, v27.4h\n"
+ "smlal v5.4s, v9.4h, v27.4h\n"
+ "smlal2 v13.4s, v11.8h, v27.8h\n"
+ "smlal2 v23.4s, v9.8h, v27.8h\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "and v17.16b, v30.16b, v18.16b\n"
+ "sqrdmulh v12.4s, v12.4s, v21.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v21.4s\n"
+ "and v22.16b, v24.16b, v1.16b\n"
+ "sqrdmulh v13.4s, v13.4s, v26.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v26.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v7.4s, v7.4s, v21.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v6.16b, v12.16b, v18.16b\n"
+ "and v27.16b, v5.16b, v18.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "and v3.16b, v7.16b, v18.16b\n"
+ "sqadd v24.4s, v24.4s, v22.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v4.16b, v13.16b, v1.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v17.16b, v23.16b, v1.16b\n"
"sshr v3.4s, v3.4s, #0x1f\n"
- "and v30.16b, v17.16b, v29.16b\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v20.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v3.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v9.4s\n"
- "srshl v7.4s, v7.4s, v9.4s\n"
- "sqadd v14.4s, v14.4s, v18.4s\n"
- "srshl v27.4s, v27.4s, v9.4s\n"
- "sqadd v22.4s, v22.4s, v19.4s\n"
- "srshl v8.4s, v8.4s, v9.4s\n"
- "sqadd v17.4s, v17.4s, v30.4s\n"
- "srshl v24.4s, v24.4s, v29.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v14.4s, v14.4s, v29.4s\n"
+ "and v16.16b, v19.16b, v1.16b\n"
+ "sqadd v12.4s, v12.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v5.4s, v5.4s, v27.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "sqadd v13.4s, v13.4s, v4.4s\n"
+ "srshl v5.4s, v5.4s, v18.4s\n"
+ "sqadd v23.4s, v23.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v18.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "sqxtn v30.4h, v30.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "sqxtn v12.4h, v12.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v19.4s, v19.4s, v1.4s\n"
"sqxtn v7.4h, v7.4s\n"
- "srshl v22.4s, v22.4s, v29.4s\n"
- "sqxtn v27.4h, v27.4s\n"
- "srshl v17.4s, v17.4s, v29.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "sqxtn2 v13.8h, v24.4s\n"
- "sqxtn2 v7.8h, v14.4s\n"
- "sqxtn2 v27.8h, v22.4s\n"
- "sqxtn2 v8.8h, v17.4s\n"
- "sqadd v13.8h, v13.8h, v25.8h\n"
- "sqadd v7.8h, v7.8h, v25.8h\n"
- "sqadd v27.8h, v27.8h, v25.8h\n"
- "sqadd v8.8h, v8.8h, v25.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v7.8h, v7.8h, v12.8h\n"
- "smax v27.8h, v27.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v26.8h\n"
- "smin v7.8h, v7.8h, v26.8h\n"
- "smin v27.8h, v27.8h, v26.8h\n"
- "smin v8.8h, v8.8h, v26.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x16, x5]\n"
+ "sqxtn2 v30.8h, v24.4s\n"
+ "sqxtn2 v12.8h, v13.4s\n"
+ "sqxtn2 v5.8h, v23.4s\n"
+ "sqxtn2 v7.8h, v19.4s\n"
+ "sqadd v30.8h, v30.8h, v15.8h\n"
+ "sqadd v12.8h, v12.8h, v15.8h\n"
+ "sqadd v5.8h, v5.8h, v15.8h\n"
+ "sqadd v7.8h, v7.8h, v15.8h\n"
+ "smax v30.8h, v30.8h, v31.8h\n"
+ "smax v12.8h, v12.8h, v31.8h\n"
+ "smax v5.8h, v5.8h, v31.8h\n"
+ "smax v7.8h, v7.8h, v31.8h\n"
+ "smin v30.8h, v30.8h, v28.8h\n"
+ "smin v12.8h, v12.8h, v28.8h\n"
+ "smin v5.8h, v5.8h, v28.8h\n"
+ "smin v7.8h, v7.8h, v28.8h\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v7.16b, v7.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "str d7, [x15, x5]\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "str d27, [x14, x5]\n"
- "str d8, [x13, x5]\n"
- "add x5, x5, #0x8\n"
+ "str d30, [x17, x3]\n"
+ "str d12, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "str d7, [x14, x3]\n"
+ "add x3, x3, #0x8\n"
"beq 124f\n"
- "add x7, x7, #0xc8\n"
+ "add x5, x5, #0xc8\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x2, #2, 5f\n"
- "ld1 { v13.4s }, [x20], #0x10\n"
- "tbz x2, #1, 4f\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
"ld1 { v24.d }[0], [x20], #0x8\n"
- "tbz x2, #0, 7f\n"
+ "tbz x1, #0, 7f\n"
"ld1 { v24.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x2, #0, 7f\n"
+ "tbz x1, #0, 7f\n"
"ld1 { v24.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x2, #1, 6f\n"
- "ld1 { v13.d }[0], [x20], #0x8\n"
- "tbz x2, #0, 7f\n"
- "ld1 { v13.s }[2], [x20]\n"
+ "tbz x1, #1, 6f\n"
+ "ld1 { v30.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v30.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 7f\n"
- "ld1 { v13.s }[0], [x20]\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v30.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d21, [x7, #0x0]\n"
- "ldr d15, [x7, #0x8]\n"
- "mov v7.16b, v13.16b\n"
- "mov v14.16b, v24.16b\n"
- "ldr d29, [x7, #0x10]\n"
- "ldr d18, [x7, #0x18]\n"
- "mov v27.16b, v13.16b\n"
- "mov v22.16b, v24.16b\n"
- "ldr d3, [x7, #0x20]\n"
- "ldp x9, x28, [x6, #0x0]\n"
- "mov v8.16b, v13.16b\n"
- "mov v17.16b, v24.16b\n"
- "ldp x27, x26, [x6, #0x10]\n"
- "ldp x25, x24, [x6, #0x20]\n"
- "usubl v21.8h, v21.8b, v2.8b\n"
- "usubl v15.8h, v15.8b, v2.8b\n"
- "ldp x23, x22, [x6, #0x30]\n"
- "ldp x21, x20, [x6, #0x40]\n"
- "usubl v29.8h, v29.8b, v2.8b\n"
- "usubl v18.8h, v18.8b, v2.8b\n"
- "usubl v3.8h, v3.8b, v2.8b\n"
- "add x9, x9, x4\n"
- "add x28, x28, x4\n"
- "add x27, x27, x4\n"
- "add x26, x26, x4\n"
- "add x25, x25, x4\n"
- "add x24, x24, x4\n"
- "add x23, x23, x4\n"
- "add x22, x22, x4\n"
- "add x21, x21, x4\n"
- "add x20, x20, x4\n"
- "tbz x2, #2, 9f\n"
- "ld1 { v10.s }[0], [x9], #0x4\n"
- "ld1 { v16.s }[0], [x28], #0x4\n"
- "ld1 { v23.s }[0], [x27], #0x4\n"
- "ld1 { v30.s }[0], [x26], #0x4\n"
- "ld1 { v4.s }[0], [x25], #0x4\n"
- "ld1 { v28.s }[0], [x24], #0x4\n"
- "ld1 { v31.s }[0], [x23], #0x4\n"
- "ld1 { v1.s }[0], [x22], #0x4\n"
- "ld1 { v9.s }[0], [x21], #0x4\n"
- "ld1 { v11.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 8f\n"
- "ld1 { v10.h }[2], [x9], #0x2\n"
- "ld1 { v16.h }[2], [x28], #0x2\n"
- "ld1 { v23.h }[2], [x27], #0x2\n"
- "ld1 { v30.h }[2], [x26], #0x2\n"
- "ld1 { v4.h }[2], [x25], #0x2\n"
- "ld1 { v28.h }[2], [x24], #0x2\n"
- "ld1 { v31.h }[2], [x23], #0x2\n"
- "ld1 { v1.h }[2], [x22], #0x2\n"
- "ld1 { v9.h }[2], [x21], #0x2\n"
- "ld1 { v11.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 11f\n"
- "ld1 { v10.b }[6], [x9]\n"
- "ld1 { v16.b }[6], [x28]\n"
- "ld1 { v23.b }[6], [x27]\n"
- "ld1 { v30.b }[6], [x26]\n"
- "ld1 { v4.b }[6], [x25]\n"
- "ld1 { v28.b }[6], [x24]\n"
- "ld1 { v31.b }[6], [x23]\n"
- "ld1 { v1.b }[6], [x22]\n"
- "ld1 { v9.b }[6], [x21]\n"
- "ld1 { v11.b }[6], [x20]\n"
+ "ldr d6, [x5, #0x0]\n"
+ "ldr d20, [x5, #0x8]\n"
+ "mov v12.16b, v30.16b\n"
+ "mov v13.16b, v24.16b\n"
+ "ldr d9, [x5, #0x10]\n"
+ "ldr d1, [x5, #0x18]\n"
+ "mov v5.16b, v30.16b\n"
+ "mov v23.16b, v24.16b\n"
+ "ldr d17, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v7.16b, v30.16b\n"
+ "mov v19.16b, v24.16b\n"
+ "usubl v6.8h, v6.8b, v14.8b\n"
+ "usubl v20.8h, v20.8b, v14.8b\n"
+ "usubl v9.8h, v9.8b, v14.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "add x9, x9, x2\n"
+ "add x28, x28, x2\n"
+ "add x27, x27, x2\n"
+ "add x26, x26, x2\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "add x25, x25, x2\n"
+ "add x24, x24, x2\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "add x23, x23, x2\n"
+ "add x22, x22, x2\n"
+ "add x21, x21, x2\n"
+ "add x20, x20, x2\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v18.s }[0], [x9], #0x4\n"
+ "ld1 { v4.s }[0], [x28], #0x4\n"
+ "ld1 { v0.s }[0], [x27], #0x4\n"
+ "ld1 { v25.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v22.s }[0], [x23], #0x4\n"
+ "ld1 { v21.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v18.h }[2], [x9], #0x2\n"
+ "ld1 { v4.h }[2], [x28], #0x2\n"
+ "ld1 { v0.h }[2], [x27], #0x2\n"
+ "ld1 { v25.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v8.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v18.b }[6], [x9]\n"
+ "ld1 { v4.b }[6], [x28]\n"
+ "ld1 { v0.b }[6], [x27]\n"
+ "ld1 { v25.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x25]\n"
+ "ld1 { v11.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v8.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x2, #0, 11f\n"
- "ld1 { v10.b }[4], [x9]\n"
- "ld1 { v16.b }[4], [x28]\n"
- "ld1 { v23.b }[4], [x27]\n"
- "ld1 { v30.b }[4], [x26]\n"
- "ld1 { v4.b }[4], [x25]\n"
- "ld1 { v28.b }[4], [x24]\n"
- "ld1 { v31.b }[4], [x23]\n"
- "ld1 { v1.b }[4], [x22]\n"
- "ld1 { v9.b }[4], [x21]\n"
- "ld1 { v11.b }[4], [x20]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v18.b }[4], [x9]\n"
+ "ld1 { v4.b }[4], [x28]\n"
+ "ld1 { v0.b }[4], [x27]\n"
+ "ld1 { v25.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x25]\n"
+ "ld1 { v11.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v8.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x2, #1, 10f\n"
- "ld1 { v10.h }[0], [x9], #0x2\n"
- "ld1 { v16.h }[0], [x28], #0x2\n"
- "ld1 { v23.h }[0], [x27], #0x2\n"
- "ld1 { v30.h }[0], [x26], #0x2\n"
- "ld1 { v4.h }[0], [x25], #0x2\n"
- "ld1 { v28.h }[0], [x24], #0x2\n"
- "ld1 { v31.h }[0], [x23], #0x2\n"
- "ld1 { v1.h }[0], [x22], #0x2\n"
- "ld1 { v9.h }[0], [x21], #0x2\n"
- "ld1 { v11.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 11f\n"
- "ld1 { v10.b }[2], [x9]\n"
- "ld1 { v16.b }[2], [x28]\n"
- "ld1 { v23.b }[2], [x27]\n"
- "ld1 { v30.b }[2], [x26]\n"
- "ld1 { v4.b }[2], [x25]\n"
- "ld1 { v28.b }[2], [x24]\n"
- "ld1 { v31.b }[2], [x23]\n"
- "ld1 { v1.b }[2], [x22]\n"
- "ld1 { v9.b }[2], [x21]\n"
- "ld1 { v11.b }[2], [x20]\n"
+ "tbz x1, #1, 10f\n"
+ "ld1 { v18.h }[0], [x9], #0x2\n"
+ "ld1 { v4.h }[0], [x28], #0x2\n"
+ "ld1 { v0.h }[0], [x27], #0x2\n"
+ "ld1 { v25.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v22.h }[0], [x23], #0x2\n"
+ "ld1 { v21.h }[0], [x22], #0x2\n"
+ "ld1 { v8.h }[0], [x21], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v18.b }[2], [x9]\n"
+ "ld1 { v4.b }[2], [x28]\n"
+ "ld1 { v0.b }[2], [x27]\n"
+ "ld1 { v25.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x25]\n"
+ "ld1 { v11.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v8.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 11f\n"
- "ld1 { v10.b }[0], [x9]\n"
- "ld1 { v16.b }[0], [x28]\n"
- "ld1 { v23.b }[0], [x27]\n"
- "ld1 { v30.b }[0], [x26]\n"
- "ld1 { v4.b }[0], [x25]\n"
- "ld1 { v28.b }[0], [x24]\n"
- "ld1 { v31.b }[0], [x23]\n"
- "ld1 { v1.b }[0], [x22]\n"
- "ld1 { v9.b }[0], [x21]\n"
- "ld1 { v11.b }[0], [x20]\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v18.b }[0], [x9]\n"
+ "ld1 { v4.b }[0], [x28]\n"
+ "ld1 { v0.b }[0], [x27]\n"
+ "ld1 { v25.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x25]\n"
+ "ld1 { v11.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x23]\n"
+ "ld1 { v21.b }[0], [x22]\n"
+ "ld1 { v8.b }[0], [x21]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ushll v10.8h, v10.8b, #0x0\n"
- "ushll v16.8h, v16.8b, #0x0\n"
- "smlal v13.4s, v10.4h, v21.4h\n"
- "ldr x20, [x6, #0x50]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "smlal2 v24.4s, v10.8h, v21.8h\n"
- "smlal v7.4s, v16.4h, v21.4h\n"
- "smlal2 v14.4s, v16.8h, v21.8h\n"
- "smlal v27.4s, v23.4h, v21.4h\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "add x20, x20, x4\n"
- "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
"ushll v4.8h, v4.8b, #0x0\n"
- "smlal v8.4s, v30.4h, v21.4h\n"
- "smlal2 v17.4s, v30.8h, v21.8h\n"
- "smlal v13.4s, v16.4h, v15.4h\n"
- "ushll v28.8h, v28.8b, #0x0\n"
- "smlal2 v24.4s, v16.8h, v15.8h\n"
- "smlal v7.4s, v4.4h, v15.4h\n"
- "ushll v31.8h, v31.8b, #0x0\n"
- "smlal2 v14.4s, v4.8h, v15.8h\n"
- "smlal v27.4s, v30.4h, v15.4h\n"
- "ushll v1.8h, v1.8b, #0x0\n"
- "smlal2 v22.4s, v30.8h, v15.8h\n"
- "ushll v9.8h, v9.8b, #0x0\n"
- "smlal v8.4s, v28.4h, v15.4h\n"
+ "ldr x20, [x4, #0x50]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
"ushll v11.8h, v11.8b, #0x0\n"
- "smlal2 v17.4s, v28.8h, v15.8h\n"
- "smlal v13.4s, v4.4h, v29.4h\n"
- "smlal2 v24.4s, v4.8h, v29.8h\n"
- "smlal v7.4s, v31.4h, v29.4h\n"
- "smlal2 v14.4s, v31.8h, v29.8h\n"
- "smlal v27.4s, v28.4h, v29.4h\n"
- "smlal2 v22.4s, v28.8h, v29.8h\n"
- "tbz x2, #2, 13f\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 12f\n"
- "ld1 { v5.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 15f\n"
- "ld1 { v5.b }[6], [x20]\n"
+ "smlal v30.4s, v18.4h, v6.4h\n"
+ "smlal2 v24.4s, v18.8h, v6.8h\n"
+ "smlal v12.4s, v4.4h, v6.4h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "add x20, x20, x2\n"
+ "smlal2 v13.4s, v4.8h, v6.8h\n"
+ "smlal v5.4s, v0.4h, v6.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal2 v23.4s, v0.8h, v6.8h\n"
+ "smlal v7.4s, v25.4h, v6.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal2 v19.4s, v25.8h, v6.8h\n"
+ "smlal v30.4s, v4.4h, v20.4h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal2 v24.4s, v4.8h, v20.8h\n"
+ "smlal v12.4s, v10.4h, v20.4h\n"
+ "smlal2 v13.4s, v10.8h, v20.8h\n"
+ "smlal v5.4s, v25.4h, v20.4h\n"
+ "smlal2 v23.4s, v25.8h, v20.8h\n"
+ "smlal v7.4s, v11.4h, v20.4h\n"
+ "smlal2 v19.4s, v11.8h, v20.8h\n"
+ "smlal v30.4s, v10.4h, v9.4h\n"
+ "smlal2 v24.4s, v10.8h, v9.8h\n"
+ "smlal v12.4s, v22.4h, v9.4h\n"
+ "smlal2 v13.4s, v22.8h, v9.8h\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "smlal2 v23.4s, v11.8h, v9.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x2, #0, 15f\n"
- "ld1 { v5.b }[4], [x20]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x2, #1, 14f\n"
- "ld1 { v5.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 15f\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "tbz x1, #1, 14f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 15f\n"
- "ld1 { v5.b }[0], [x20]\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v2.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ushll v5.8h, v5.8b, #0x0\n"
- "ldr x20, [x6, #0x58]\n"
- "smlal v8.4s, v5.4h, v29.4h\n"
- "smlal2 v17.4s, v5.8h, v29.8h\n"
- "smlal v13.4s, v31.4h, v18.4h\n"
- "smlal2 v24.4s, v31.8h, v18.8h\n"
- "add x20, x20, x4\n"
- "smlal v7.4s, v1.4h, v18.4h\n"
- "smlal2 v14.4s, v1.8h, v18.8h\n"
- "smlal v27.4s, v5.4h, v18.4h\n"
- "smlal2 v22.4s, v5.8h, v18.8h\n"
- "tbz x2, #2, 17f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 16f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 19f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ushll v2.8h, v2.8b, #0x0\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v30.4s, v22.4h, v1.4h\n"
+ "smlal2 v24.4s, v22.8h, v1.8h\n"
+ "smlal v12.4s, v21.4h, v1.4h\n"
+ "smlal2 v13.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v2.4h, v9.4h\n"
+ "smlal2 v19.4s, v2.8h, v9.8h\n"
+ "smlal v5.4s, v2.4h, v1.4h\n"
+ "smlal2 v23.4s, v2.8h, v1.8h\n"
+ "add x20, x20, x2\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v22.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x2, #0, 19f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v22.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x2, #1, 18f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 19f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x1, #1, 18f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v22.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 19f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v22.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ushll v10.8h, v10.8b, #0x0\n"
- "ldr x20, [x6, #0x60]\n"
- "smlal v8.4s, v10.4h, v18.4h\n"
- "smlal2 v17.4s, v10.8h, v18.8h\n"
- "smlal v13.4s, v1.4h, v3.4h\n"
- "smlal2 v24.4s, v1.8h, v3.8h\n"
- "add x20, x20, x4\n"
- "tbz x2, #2, 21f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 20f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 23f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "ldr x20, [x4, #0x60]\n"
+ "smlal v30.4s, v21.4h, v17.4h\n"
+ "smlal2 v24.4s, v21.8h, v17.8h\n"
+ "smlal v7.4s, v22.4h, v1.4h\n"
+ "smlal2 v19.4s, v22.8h, v1.8h\n"
+ "add x20, x20, x2\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x2, #0, 23f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x2, #1, 22f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 23f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x1, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 23f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d6, [x7, #0x28]\n"
- "ushll v15.8h, v15.8b, #0x0\n"
- "smlal v7.4s, v15.4h, v3.4h\n"
- "smlal2 v14.4s, v15.8h, v3.8h\n"
- "smlal v27.4s, v10.4h, v3.4h\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "usubl v6.8h, v6.8b, v2.8b\n"
- "ldr x20, [x6, #0x68]\n"
- "smlal v8.4s, v9.4h, v3.4h\n"
- "smlal2 v17.4s, v9.8h, v3.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v23.4h, v6.4h\n"
- "smlal2 v24.4s, v23.8h, v6.8h\n"
- "smlal v7.4s, v30.4h, v6.4h\n"
- "smlal2 v14.4s, v30.8h, v6.8h\n"
- "smlal v27.4s, v11.4h, v6.4h\n"
- "smlal2 v22.4s, v11.8h, v6.8h\n"
- "tbz x2, #2, 25f\n"
- "ld1 { v20.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 24f\n"
- "ld1 { v20.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 27f\n"
- "ld1 { v20.b }[6], [x20]\n"
+ "ldr d18, [x5, #0x28]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v5.4s, v22.4h, v17.4h\n"
+ "smlal2 v23.4s, v22.8h, v17.8h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v7.4s, v8.4h, v17.4h\n"
+ "smlal2 v19.4s, v8.8h, v17.8h\n"
+ "smlal v12.4s, v16.4h, v17.4h\n"
+ "smlal2 v13.4s, v16.8h, v17.8h\n"
+ "usubl v18.8h, v18.8b, v14.8b\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v0.4h, v18.4h\n"
+ "smlal2 v24.4s, v0.8h, v18.8h\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v23.4s, v26.8h, v18.8h\n"
+ "smlal v12.4s, v25.4h, v18.4h\n"
+ "smlal2 v13.4s, v25.8h, v18.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x2, #0, 27f\n"
- "ld1 { v20.b }[4], [x20]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x2, #1, 26f\n"
- "ld1 { v20.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 27f\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "tbz x1, #1, 26f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 27f\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v29.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d4, [x7, #0x30]\n"
- "ushll v20.8h, v20.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v2.8b\n"
- "ldr x20, [x6, #0x70]\n"
- "smlal v8.4s, v20.4h, v6.4h\n"
- "smlal2 v17.4s, v20.8h, v6.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v24.4s, v30.8h, v4.8h\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v14.4s, v28.8h, v4.8h\n"
- "smlal v27.4s, v20.4h, v4.4h\n"
- "smlal2 v22.4s, v20.8h, v4.8h\n"
- "tbz x2, #2, 29f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 28f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 31f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x30]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x4, #0x70]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v29.4h, v18.4h\n"
+ "smlal2 v19.4s, v29.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v25.4h, v16.4h\n"
+ "smlal2 v24.4s, v25.8h, v16.8h\n"
+ "smlal v12.4s, v11.4h, v16.4h\n"
+ "smlal2 v13.4s, v11.8h, v16.8h\n"
+ "smlal v5.4s, v29.4h, v16.4h\n"
+ "smlal2 v23.4s, v29.8h, v16.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x2, #0, 31f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x2, #1, 30f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 31f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x1, #1, 30f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 31f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d30, [x7, #0x38]\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "usubl v30.8h, v30.8b, v2.8b\n"
- "ldr x20, [x6, #0x78]\n"
- "smlal v8.4s, v23.4h, v4.4h\n"
- "smlal2 v17.4s, v23.8h, v4.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v28.4h, v30.4h\n"
- "smlal2 v24.4s, v28.8h, v30.8h\n"
- "smlal v7.4s, v5.4h, v30.4h\n"
- "smlal2 v14.4s, v5.8h, v30.8h\n"
- "smlal v27.4s, v23.4h, v30.4h\n"
- "smlal2 v22.4s, v23.8h, v30.8h\n"
- "tbz x2, #2, 33f\n"
- "ld1 { v3.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 32f\n"
- "ld1 { v3.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 35f\n"
- "ld1 { v3.b }[6], [x20]\n"
+ "ldr d17, [x5, #0x38]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ldr x20, [x4, #0x78]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "smlal v7.4s, v9.4h, v16.4h\n"
+ "smlal2 v19.4s, v9.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v11.4h, v17.4h\n"
+ "smlal2 v24.4s, v11.8h, v17.8h\n"
+ "smlal v12.4s, v2.4h, v17.4h\n"
+ "smlal2 v13.4s, v2.8h, v17.8h\n"
+ "smlal v5.4s, v9.4h, v17.4h\n"
+ "smlal2 v23.4s, v9.8h, v17.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x2, #0, 35f\n"
- "ld1 { v3.b }[4], [x20]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x2, #1, 34f\n"
- "ld1 { v3.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 35f\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "tbz x1, #1, 34f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 35f\n"
- "ld1 { v3.b }[0], [x20]\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v6.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d16, [x7, #0x40]\n"
- "ushll v3.8h, v3.8b, #0x0\n"
- "usubl v16.8h, v16.8b, v2.8b\n"
- "ldr x20, [x6, #0x80]\n"
- "smlal v8.4s, v3.4h, v30.4h\n"
- "smlal2 v17.4s, v3.8h, v30.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v5.4h, v16.4h\n"
- "smlal2 v24.4s, v5.8h, v16.8h\n"
- "smlal v7.4s, v10.4h, v16.4h\n"
- "smlal2 v14.4s, v10.8h, v16.8h\n"
- "smlal v27.4s, v3.4h, v16.4h\n"
- "smlal2 v22.4s, v3.8h, v16.8h\n"
- "tbz x2, #2, 37f\n"
- "ld1 { v6.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 36f\n"
- "ld1 { v6.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 39f\n"
- "ld1 { v6.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x40]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x4, #0x80]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v6.4h, v17.4h\n"
+ "smlal2 v19.4s, v6.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v2.4h, v16.4h\n"
+ "smlal2 v24.4s, v2.8h, v16.8h\n"
+ "smlal v12.4s, v22.4h, v16.4h\n"
+ "smlal2 v13.4s, v22.8h, v16.8h\n"
+ "smlal v5.4s, v6.4h, v16.4h\n"
+ "smlal2 v23.4s, v6.8h, v16.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x2, #0, 39f\n"
- "ld1 { v6.b }[4], [x20]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x2, #1, 38f\n"
- "ld1 { v6.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 39f\n"
- "ld1 { v6.b }[2], [x20]\n"
+ "tbz x1, #1, 38f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 39f\n"
- "ld1 { v6.b }[0], [x20]\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d1, [x7, #0x48]\n"
- "ushll v6.8h, v6.8b, #0x0\n"
- "usubl v1.8h, v1.8b, v2.8b\n"
- "ldr x20, [x6, #0x88]\n"
- "smlal v8.4s, v6.4h, v16.4h\n"
- "smlal2 v17.4s, v6.8h, v16.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v10.4h, v1.4h\n"
- "smlal2 v24.4s, v10.8h, v1.8h\n"
- "smlal v7.4s, v9.4h, v1.4h\n"
- "smlal2 v14.4s, v9.8h, v1.8h\n"
- "smlal v27.4s, v6.4h, v1.4h\n"
- "smlal2 v22.4s, v6.8h, v1.8h\n"
- "tbz x2, #2, 41f\n"
- "ld1 { v18.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 40f\n"
- "ld1 { v18.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 43f\n"
- "ld1 { v18.b }[6], [x20]\n"
+ "ldr d4, [x5, #0x48]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x20, [x4, #0x88]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v7.4s, v25.4h, v16.4h\n"
+ "smlal2 v19.4s, v25.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v22.4h, v4.4h\n"
+ "smlal2 v24.4s, v22.8h, v4.8h\n"
+ "smlal v12.4s, v8.4h, v4.4h\n"
+ "smlal2 v13.4s, v8.8h, v4.8h\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v23.4s, v25.8h, v4.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x2, #0, 43f\n"
- "ld1 { v18.b }[4], [x20]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x2, #1, 42f\n"
- "ld1 { v18.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 43f\n"
- "ld1 { v18.b }[2], [x20]\n"
+ "tbz x1, #1, 42f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 43f\n"
- "ld1 { v18.b }[0], [x20]\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v20.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d28, [x7, #0x50]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "usubl v28.8h, v28.8b, v2.8b\n"
- "ldr x20, [x6, #0x90]\n"
- "smlal v8.4s, v18.4h, v1.4h\n"
- "smlal2 v17.4s, v18.8h, v1.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v11.4h, v28.4h\n"
- "smlal2 v24.4s, v11.8h, v28.8h\n"
- "smlal v7.4s, v20.4h, v28.4h\n"
- "smlal2 v14.4s, v20.8h, v28.8h\n"
- "tbz x2, #2, 45f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 44f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 47f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x50]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x4, #0x90]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v20.4h, v4.4h\n"
+ "smlal2 v19.4s, v20.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v26.4h, v16.4h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "smlal v12.4s, v29.4h, v16.4h\n"
+ "smlal2 v13.4s, v29.8h, v16.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x2, #0, 47f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x2, #1, 46f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 47f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "tbz x1, #1, 46f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 47f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v21.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ushll v30.8h, v30.8b, #0x0\n"
- "ldr x20, [x6, #0x98]\n"
- "smlal v27.4s, v30.4h, v28.4h\n"
- "smlal2 v22.4s, v30.8h, v28.8h\n"
- "add x20, x20, x4\n"
- "tbz x2, #2, 49f\n"
- "ld1 { v19.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 48f\n"
- "ld1 { v19.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 51f\n"
- "ld1 { v19.b }[6], [x20]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x4, #0x98]\n"
+ "smlal v5.4s, v21.4h, v16.4h\n"
+ "smlal2 v23.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v27.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x2, #0, 51f\n"
- "ld1 { v19.b }[4], [x20]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v27.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x2, #1, 50f\n"
- "ld1 { v19.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 51f\n"
- "ld1 { v19.b }[2], [x20]\n"
+ "tbz x1, #1, 50f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v27.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 51f\n"
- "ld1 { v19.b }[0], [x20]\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v27.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d0, [x7, #0x58]\n"
- "ushll v19.8h, v19.8b, #0x0\n"
- "usubl v0.8h, v0.8b, v2.8b\n"
- "ldr x20, [x6, #0xa0]\n"
- "smlal v8.4s, v19.4h, v28.4h\n"
- "smlal2 v17.4s, v19.8h, v28.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v20.4h, v0.4h\n"
- "smlal2 v24.4s, v20.8h, v0.8h\n"
- "smlal v7.4s, v23.4h, v0.4h\n"
- "smlal2 v14.4s, v23.8h, v0.8h\n"
- "smlal v27.4s, v19.4h, v0.4h\n"
- "smlal2 v22.4s, v19.8h, v0.8h\n"
- "tbz x2, #2, 53f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 52f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 55f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ldr d17, [x5, #0x58]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "smlal v7.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v29.4h, v17.4h\n"
+ "smlal2 v24.4s, v29.8h, v17.8h\n"
+ "smlal v12.4s, v9.4h, v17.4h\n"
+ "smlal2 v13.4s, v9.8h, v17.8h\n"
+ "smlal v5.4s, v27.4h, v17.4h\n"
+ "smlal2 v23.4s, v27.8h, v17.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v0.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x2, #0, 55f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v0.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x2, #1, 54f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 55f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "tbz x1, #1, 54f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v0.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 55f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v0.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d10, [x7, #0x60]\n"
- "ushll v9.8h, v9.8b, #0x0\n"
- "usubl v10.8h, v10.8b, v2.8b\n"
- "ldr x20, [x6, #0xa8]\n"
- "smlal v8.4s, v9.4h, v0.4h\n"
- "smlal2 v17.4s, v9.8h, v0.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v23.4h, v10.4h\n"
- "smlal2 v24.4s, v23.8h, v10.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "smlal2 v14.4s, v3.8h, v10.8h\n"
- "smlal v27.4s, v9.4h, v10.4h\n"
- "smlal2 v22.4s, v9.8h, v10.8h\n"
- "tbz x2, #2, 57f\n"
- "ld1 { v20.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 56f\n"
- "ld1 { v20.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 59f\n"
- "ld1 { v20.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x60]\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x4, #0xa8]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v0.4h, v17.4h\n"
+ "smlal2 v19.4s, v0.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v9.4h, v16.4h\n"
+ "smlal2 v24.4s, v9.8h, v16.8h\n"
+ "smlal v12.4s, v6.4h, v16.4h\n"
+ "smlal2 v13.4s, v6.8h, v16.8h\n"
+ "smlal v5.4s, v0.4h, v16.4h\n"
+ "smlal2 v23.4s, v0.8h, v16.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x2, #0, 59f\n"
- "ld1 { v20.b }[4], [x20]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x2, #1, 58f\n"
- "ld1 { v20.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 59f\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "tbz x1, #1, 58f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 59f\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v3.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d28, [x7, #0x68]\n"
- "ushll v20.8h, v20.8b, #0x0\n"
- "usubl v28.8h, v28.8b, v2.8b\n"
- "ldr x20, [x6, #0xb0]\n"
- "smlal v8.4s, v20.4h, v10.4h\n"
- "smlal2 v17.4s, v20.8h, v10.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v3.4h, v28.4h\n"
- "smlal2 v24.4s, v3.8h, v28.8h\n"
- "smlal v7.4s, v6.4h, v28.4h\n"
- "smlal2 v14.4s, v6.8h, v28.8h\n"
- "smlal v27.4s, v20.4h, v28.4h\n"
- "smlal2 v22.4s, v20.8h, v28.8h\n"
- "tbz x2, #2, 61f\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 60f\n"
- "ld1 { v5.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 63f\n"
- "ld1 { v5.b }[6], [x20]\n"
+ "ldr d17, [x5, #0x68]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "smlal v7.4s, v3.4h, v16.4h\n"
+ "smlal2 v19.4s, v3.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v6.4h, v17.4h\n"
+ "smlal2 v24.4s, v6.8h, v17.8h\n"
+ "smlal v12.4s, v25.4h, v17.4h\n"
+ "smlal2 v13.4s, v25.8h, v17.8h\n"
+ "smlal v5.4s, v3.4h, v17.4h\n"
+ "smlal2 v23.4s, v3.8h, v17.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x2, #0, 63f\n"
- "ld1 { v5.b }[4], [x20]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x2, #1, 62f\n"
- "ld1 { v5.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 63f\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "tbz x1, #1, 62f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 63f\n"
- "ld1 { v5.b }[0], [x20]\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d23, [x7, #0x70]\n"
- "ushll v5.8h, v5.8b, #0x0\n"
- "usubl v23.8h, v23.8b, v2.8b\n"
- "ldr x20, [x6, #0xb8]\n"
- "smlal v8.4s, v5.4h, v28.4h\n"
- "smlal2 v17.4s, v5.8h, v28.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v6.4h, v23.4h\n"
- "smlal2 v24.4s, v6.8h, v23.8h\n"
- "smlal v7.4s, v18.4h, v23.4h\n"
- "smlal2 v14.4s, v18.8h, v23.8h\n"
- "smlal v27.4s, v5.4h, v23.4h\n"
- "smlal2 v22.4s, v5.8h, v23.8h\n"
- "tbz x2, #2, 65f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 64f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 67f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x70]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v26.4h, v17.4h\n"
+ "smlal2 v19.4s, v26.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v25.4h, v16.4h\n"
+ "smlal2 v24.4s, v25.8h, v16.8h\n"
+ "smlal v12.4s, v20.4h, v16.4h\n"
+ "smlal2 v13.4s, v20.8h, v16.8h\n"
+ "smlal v5.4s, v26.4h, v16.4h\n"
+ "smlal2 v23.4s, v26.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x2, #0, 67f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x2, #1, 66f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 67f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "tbz x1, #1, 66f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 67f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v2.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d4, [x7, #0x78]\n"
- "ushll v29.8h, v29.8b, #0x0\n"
- "usubl v4.8h, v4.8b, v2.8b\n"
- "ldr x20, [x6, #0xc0]\n"
- "smlal v8.4s, v29.4h, v23.4h\n"
- "smlal2 v17.4s, v29.8h, v23.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v30.4h, v4.4h\n"
- "smlal2 v24.4s, v30.8h, v4.8h\n"
- "smlal v7.4s, v19.4h, v4.4h\n"
- "smlal2 v14.4s, v19.8h, v4.8h\n"
- "tbz x2, #2, 69f\n"
+ "ldr d17, [x5, #0x78]\n"
+ "ushll v2.8h, v2.8b, #0x0\n"
+ "ldr x20, [x4, #0xc0]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "smlal v7.4s, v2.4h, v16.4h\n"
+ "smlal2 v19.4s, v2.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v21.4h, v17.4h\n"
+ "smlal2 v24.4s, v21.8h, v17.8h\n"
+ "smlal v12.4s, v27.4h, v17.4h\n"
+ "smlal2 v13.4s, v27.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
"ld1 { v18.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 68f\n"
+ "tbz x1, #1, 68f\n"
"ld1 { v18.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 71f\n"
+ "tbz x1, #0, 71f\n"
"ld1 { v18.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x2, #0, 71f\n"
+ "tbz x1, #0, 71f\n"
"ld1 { v18.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x2, #1, 70f\n"
+ "tbz x1, #1, 70f\n"
"ld1 { v18.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 71f\n"
+ "tbz x1, #0, 71f\n"
"ld1 { v18.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 71f\n"
+ "tbz x1, #0, 71f\n"
"ld1 { v18.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
"ushll v18.8h, v18.8b, #0x0\n"
- "ldr x20, [x6, #0xc8]\n"
- "smlal v27.4s, v18.4h, v4.4h\n"
- "smlal2 v22.4s, v18.8h, v4.8h\n"
- "add x20, x20, x4\n"
- "tbz x2, #2, 73f\n"
- "ld1 { v1.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 72f\n"
- "ld1 { v1.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 75f\n"
- "ld1 { v1.b }[6], [x20]\n"
+ "ldr x20, [x4, #0xc8]\n"
+ "smlal v5.4s, v18.4h, v17.4h\n"
+ "smlal2 v23.4s, v18.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v10.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x2, #0, 75f\n"
- "ld1 { v1.b }[4], [x20]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v10.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x2, #1, 74f\n"
- "ld1 { v1.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 75f\n"
- "ld1 { v1.b }[2], [x20]\n"
+ "tbz x1, #1, 74f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v10.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 75f\n"
- "ld1 { v1.b }[0], [x20]\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v10.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d23, [x7, #0x80]\n"
- "ushll v1.8h, v1.8b, #0x0\n"
- "usubl v23.8h, v23.8b, v2.8b\n"
- "ldr x20, [x6, #0xd0]\n"
- "smlal v8.4s, v1.4h, v4.4h\n"
- "smlal2 v17.4s, v1.8h, v4.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v19.4h, v23.4h\n"
- "smlal2 v24.4s, v19.8h, v23.8h\n"
- "smlal v7.4s, v9.4h, v23.4h\n"
- "smlal2 v14.4s, v9.8h, v23.8h\n"
- "smlal v27.4s, v1.4h, v23.4h\n"
- "smlal2 v22.4s, v1.8h, v23.8h\n"
- "tbz x2, #2, 77f\n"
- "ld1 { v4.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 76f\n"
- "ld1 { v4.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 79f\n"
- "ld1 { v4.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x80]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x4, #0xd0]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v10.4h, v17.4h\n"
+ "smlal2 v19.4s, v10.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v27.4h, v16.4h\n"
+ "smlal2 v24.4s, v27.8h, v16.8h\n"
+ "smlal v12.4s, v0.4h, v16.4h\n"
+ "smlal2 v13.4s, v0.8h, v16.8h\n"
+ "smlal v5.4s, v10.4h, v16.4h\n"
+ "smlal2 v23.4s, v10.8h, v16.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v6.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x2, #0, 79f\n"
- "ld1 { v4.b }[4], [x20]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v6.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x2, #1, 78f\n"
- "ld1 { v4.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 79f\n"
- "ld1 { v4.b }[2], [x20]\n"
+ "tbz x1, #1, 78f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v6.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 79f\n"
- "ld1 { v4.b }[0], [x20]\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v6.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d30, [x7, #0x88]\n"
- "ushll v4.8h, v4.8b, #0x0\n"
- "usubl v30.8h, v30.8b, v2.8b\n"
- "ldr x20, [x6, #0xd8]\n"
- "smlal v8.4s, v4.4h, v23.4h\n"
- "smlal2 v17.4s, v4.8h, v23.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v9.4h, v30.4h\n"
- "smlal2 v24.4s, v9.8h, v30.8h\n"
- "smlal v7.4s, v20.4h, v30.4h\n"
- "smlal2 v14.4s, v20.8h, v30.8h\n"
- "smlal v27.4s, v4.4h, v30.4h\n"
- "smlal2 v22.4s, v4.8h, v30.8h\n"
- "tbz x2, #2, 81f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 80f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 83f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ldr d17, [x5, #0x88]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x4, #0xd8]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v19.4s, v6.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v0.4h, v17.4h\n"
+ "smlal2 v24.4s, v0.8h, v17.8h\n"
+ "smlal v12.4s, v3.4h, v17.4h\n"
+ "smlal2 v13.4s, v3.8h, v17.8h\n"
+ "smlal v5.4s, v6.4h, v17.4h\n"
+ "smlal2 v23.4s, v6.8h, v17.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x2, #0, 83f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x2, #1, 82f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 83f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x1, #1, 82f\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 83f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v11.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x7, #0x90]\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "usubl v3.8h, v3.8b, v2.8b\n"
- "ldr x20, [x6, #0xe0]\n"
- "smlal v8.4s, v21.4h, v30.4h\n"
- "smlal2 v17.4s, v21.8h, v30.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v20.4h, v3.4h\n"
- "smlal2 v24.4s, v20.8h, v3.8h\n"
- "smlal v7.4s, v5.4h, v3.4h\n"
- "smlal2 v14.4s, v5.8h, v3.8h\n"
- "smlal v27.4s, v21.4h, v3.4h\n"
- "smlal2 v22.4s, v21.8h, v3.8h\n"
- "tbz x2, #2, 85f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 84f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 87f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ldr d16, [x5, #0x90]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x20, [x4, #0xe0]\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal v7.4s, v11.4h, v17.4h\n"
+ "smlal2 v19.4s, v11.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v3.4h, v16.4h\n"
+ "smlal2 v24.4s, v3.8h, v16.8h\n"
+ "smlal v12.4s, v26.4h, v16.4h\n"
+ "smlal2 v13.4s, v26.8h, v16.8h\n"
+ "smlal v5.4s, v11.4h, v16.4h\n"
+ "smlal2 v23.4s, v11.8h, v16.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x2, #0, 87f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x2, #1, 86f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 87f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 87f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d19, [x7, #0x98]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "usubl v19.8h, v19.8b, v2.8b\n"
- "ldr x20, [x6, #0xe8]\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "smlal2 v17.4s, v30.8h, v3.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v5.4h, v19.4h\n"
- "smlal2 v24.4s, v5.8h, v19.8h\n"
- "smlal v7.4s, v29.4h, v19.4h\n"
- "smlal2 v14.4s, v29.8h, v19.8h\n"
- "smlal v27.4s, v30.4h, v19.4h\n"
- "smlal2 v22.4s, v30.8h, v19.8h\n"
- "tbz x2, #2, 89f\n"
- "ld1 { v20.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 88f\n"
- "ld1 { v20.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 91f\n"
- "ld1 { v20.b }[6], [x20]\n"
+ "ldr d17, [x5, #0x98]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x20, [x4, #0xe8]\n"
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "smlal v7.4s, v25.4h, v16.4h\n"
+ "smlal2 v19.4s, v25.8h, v16.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v26.4h, v17.4h\n"
+ "smlal2 v24.4s, v26.8h, v17.8h\n"
+ "smlal v12.4s, v2.4h, v17.4h\n"
+ "smlal2 v13.4s, v2.8h, v17.8h\n"
+ "smlal v5.4s, v25.4h, v17.4h\n"
+ "smlal2 v23.4s, v25.8h, v17.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x2, #0, 91f\n"
- "ld1 { v20.b }[4], [x20]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x2, #1, 90f\n"
- "ld1 { v20.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 91f\n"
- "ld1 { v20.b }[2], [x20]\n"
+ "tbz x1, #1, 90f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 91f\n"
- "ld1 { v20.b }[0], [x20]\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d23, [x7, #0xa0]\n"
- "ushll v20.8h, v20.8b, #0x0\n"
- "usubl v23.8h, v23.8b, v2.8b\n"
- "ldr x20, [x6, #0xf0]\n"
- "smlal v8.4s, v20.4h, v19.4h\n"
- "smlal2 v17.4s, v20.8h, v19.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v18.4h, v23.4h\n"
- "smlal2 v24.4s, v18.8h, v23.8h\n"
- "smlal v7.4s, v1.4h, v23.4h\n"
- "smlal2 v14.4s, v1.8h, v23.8h\n"
- "tbz x2, #2, 93f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 92f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 95f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ldr d4, [x5, #0xa0]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ldr x20, [x4, #0xf0]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v7.4s, v9.4h, v17.4h\n"
+ "smlal2 v19.4s, v9.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v18.4h, v4.4h\n"
+ "smlal2 v24.4s, v18.8h, v4.8h\n"
+ "smlal v12.4s, v10.4h, v4.4h\n"
+ "smlal2 v13.4s, v10.8h, v4.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x2, #0, 95f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x2, #1, 94f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 95f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x1, #1, 94f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 95f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ushll v10.8h, v10.8b, #0x0\n"
- "ldr x20, [x6, #0xf8]\n"
- "smlal v27.4s, v10.4h, v23.4h\n"
- "smlal2 v22.4s, v10.8h, v23.8h\n"
- "add x20, x20, x4\n"
- "tbz x2, #2, 97f\n"
- "ld1 { v18.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 96f\n"
- "ld1 { v18.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 99f\n"
- "ld1 { v18.b }[6], [x20]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x4, #0xf8]\n"
+ "smlal v5.4s, v16.4h, v4.4h\n"
+ "smlal2 v23.4s, v16.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x2, #0, 99f\n"
- "ld1 { v18.b }[4], [x20]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x2, #1, 98f\n"
- "ld1 { v18.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 99f\n"
- "ld1 { v18.b }[2], [x20]\n"
+ "tbz x1, #1, 98f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 99f\n"
- "ld1 { v18.b }[0], [x20]\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d5, [x7, #0xa8]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "usubl v5.8h, v5.8b, v2.8b\n"
- "ldr x20, [x6, #0x100]\n"
- "smlal v8.4s, v18.4h, v23.4h\n"
- "smlal2 v17.4s, v18.8h, v23.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v1.4h, v5.4h\n"
- "smlal2 v24.4s, v1.8h, v5.8h\n"
- "smlal v7.4s, v4.4h, v5.4h\n"
- "smlal2 v14.4s, v4.8h, v5.8h\n"
- "smlal v27.4s, v18.4h, v5.4h\n"
- "smlal2 v22.4s, v18.8h, v5.8h\n"
- "tbz x2, #2, 101f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 100f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 103f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ldr d26, [x5, #0xa8]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ldr x20, [x4, #0x100]\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v7.4s, v17.4h, v4.4h\n"
+ "smlal2 v19.4s, v17.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v10.4h, v26.4h\n"
+ "smlal2 v24.4s, v10.8h, v26.8h\n"
+ "smlal v12.4s, v6.4h, v26.4h\n"
+ "smlal2 v13.4s, v6.8h, v26.8h\n"
+ "smlal v5.4s, v17.4h, v26.4h\n"
+ "smlal2 v23.4s, v17.8h, v26.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x2, #0, 103f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x2, #1, 102f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 103f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "tbz x1, #1, 102f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 103f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d18, [x7, #0xb0]\n"
- "ushll v9.8h, v9.8b, #0x0\n"
- "usubl v18.8h, v18.8b, v2.8b\n"
- "ldr x20, [x6, #0x108]\n"
- "smlal v8.4s, v9.4h, v5.4h\n"
- "smlal2 v17.4s, v9.8h, v5.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v21.4h, v18.4h\n"
- "smlal2 v14.4s, v21.8h, v18.8h\n"
- "smlal v27.4s, v9.4h, v18.4h\n"
- "smlal2 v22.4s, v9.8h, v18.8h\n"
- "tbz x2, #2, 105f\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 104f\n"
- "ld1 { v5.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 107f\n"
- "ld1 { v5.b }[6], [x20]\n"
+ "ldr d4, [x5, #0xb0]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x4, #0x108]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v7.4s, v16.4h, v26.4h\n"
+ "smlal2 v19.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v6.4h, v4.4h\n"
+ "smlal2 v24.4s, v6.8h, v4.8h\n"
+ "smlal v12.4s, v11.4h, v4.4h\n"
+ "smlal2 v13.4s, v11.8h, v4.8h\n"
+ "smlal v5.4s, v16.4h, v4.4h\n"
+ "smlal2 v23.4s, v16.8h, v4.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x2, #0, 107f\n"
- "ld1 { v5.b }[4], [x20]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x2, #1, 106f\n"
- "ld1 { v5.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 107f\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "tbz x1, #1, 106f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 107f\n"
- "ld1 { v5.b }[0], [x20]\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d11, [x7, #0xb8]\n"
- "ushll v5.8h, v5.8b, #0x0\n"
- "usubl v11.8h, v11.8b, v2.8b\n"
- "ldr x20, [x6, #0x110]\n"
- "smlal v8.4s, v5.4h, v18.4h\n"
- "smlal2 v17.4s, v5.8h, v18.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v21.4h, v11.4h\n"
- "smlal2 v24.4s, v21.8h, v11.8h\n"
- "smlal v7.4s, v30.4h, v11.4h\n"
- "smlal2 v14.4s, v30.8h, v11.8h\n"
- "smlal v27.4s, v5.4h, v11.4h\n"
- "smlal2 v22.4s, v5.8h, v11.8h\n"
- "tbz x2, #2, 109f\n"
- "ld1 { v18.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 108f\n"
- "ld1 { v18.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 111f\n"
- "ld1 { v18.b }[6], [x20]\n"
+ "ldr d2, [x5, #0xb8]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ldr x20, [x4, #0x110]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v7.4s, v17.4h, v4.4h\n"
+ "smlal2 v19.4s, v17.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v11.4h, v2.4h\n"
+ "smlal2 v24.4s, v11.8h, v2.8h\n"
+ "smlal v12.4s, v25.4h, v2.4h\n"
+ "smlal2 v13.4s, v25.8h, v2.8h\n"
+ "smlal v5.4s, v17.4h, v2.4h\n"
+ "smlal2 v23.4s, v17.8h, v2.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x2, #0, 111f\n"
- "ld1 { v18.b }[4], [x20]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x2, #1, 110f\n"
- "ld1 { v18.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 111f\n"
- "ld1 { v18.b }[2], [x20]\n"
+ "tbz x1, #1, 110f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 111f\n"
- "ld1 { v18.b }[0], [x20]\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d16, [x7, #0xc0]\n"
- "ushll v18.8h, v18.8b, #0x0\n"
- "usubl v16.8h, v16.8b, v2.8b\n"
- "ldr x20, [x6, #0x118]\n"
- "smlal v8.4s, v18.4h, v11.4h\n"
- "smlal2 v17.4s, v18.8h, v11.8h\n"
- "add x20, x20, x4\n"
- "smlal v13.4s, v30.4h, v16.4h\n"
- "smlal2 v24.4s, v30.8h, v16.8h\n"
- "smlal v7.4s, v20.4h, v16.4h\n"
- "smlal2 v14.4s, v20.8h, v16.8h\n"
- "smlal v27.4s, v18.4h, v16.4h\n"
- "smlal2 v22.4s, v18.8h, v16.8h\n"
- "tbz x2, #2, 113f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x2, #1, 112f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x2, #0, 115f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ldr d4, [x5, #0xc0]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr x20, [x4, #0x118]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v7.4s, v16.4h, v2.4h\n"
+ "smlal2 v19.4s, v16.8h, v2.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v30.4s, v25.4h, v4.4h\n"
+ "smlal2 v24.4s, v25.8h, v4.8h\n"
+ "smlal v12.4s, v9.4h, v4.4h\n"
+ "smlal2 v13.4s, v9.8h, v4.8h\n"
+ "smlal v5.4s, v16.4h, v4.4h\n"
+ "smlal2 v23.4s, v16.8h, v4.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x2, #0, 115f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x2, #1, 114f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x2, #0, 115f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x1, #1, 114f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 115f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ushll v21.8h, v21.8b, #0x0\n"
- "smlal v8.4s, v21.4h, v16.4h\n"
- "smlal2 v17.4s, v21.8h, v16.8h\n"
- "tbz x2, #2, 117f\n"
- "ld1 { v16.4s }, [x8], #0x10\n"
- "ld1 { v21.4s }, [x17], #0x10\n"
- "tbz x2, #1, 116f\n"
- "ld1 { v18.d }[0], [x8], #0x8\n"
- "ld1 { v0.d }[0], [x17], #0x8\n"
- "tbz x2, #0, 119f\n"
- "ld1 { v18.s }[2], [x8]\n"
- "ld1 { v0.s }[2], [x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "smlal v7.4s, v17.4h, v4.4h\n"
+ "smlal2 v19.4s, v17.8h, v4.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v16.4s }, [x6], #0x10\n"
+ "ld1 { v27.4s }, [x7], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v11.d }[0], [x6], #0x8\n"
+ "ld1 { v18.d }[0], [x7], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v11.s }[2], [x6]\n"
+ "ld1 { v18.s }[2], [x7]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x2, #0, 119f\n"
- "ld1 { v18.s }[0], [x8]\n"
- "ld1 { v0.s }[0], [x17]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v11.s }[0], [x6]\n"
+ "ld1 { v18.s }[0], [x7]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x2, #1, 118f\n"
- "ld1 { v16.d }[0], [x8], #0x8\n"
- "ld1 { v21.d }[0], [x17], #0x8\n"
- "tbz x2, #0, 119f\n"
- "ld1 { v16.s }[2], [x8]\n"
- "ld1 { v21.s }[2], [x17]\n"
+ "tbz x1, #1, 118f\n"
+ "ld1 { v16.d }[0], [x6], #0x8\n"
+ "ld1 { v27.d }[0], [x7], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v16.s }[2], [x6]\n"
+ "ld1 { v27.s }[2], [x7]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 119f\n"
- "ld1 { v16.s }[0], [x8]\n"
- "ld1 { v21.s }[0], [x17]\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v16.s }[0], [x6]\n"
+ "ld1 { v27.s }[0], [x7]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v16.4s\n"
- "and v5.16b, v13.16b, v21.16b\n"
- "add x16, x16, x5\n"
- "add x15, x15, x5\n"
- "sqrdmulh v24.4s, v24.4s, v18.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "add x14, x14, x5\n"
- "add x13, x13, x5\n"
- "and v2.16b, v24.16b, v0.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v16.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v11.4s\n"
+ "add x17, x17, x3\n"
+ "add x16, x16, x3\n"
+ "sqrdmulh v12.4s, v12.4s, v16.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v16.4s\n"
+ "add x15, x15, x3\n"
+ "add x14, x14, x3\n"
"sqrdmulh v7.4s, v7.4s, v16.4s\n"
- "sqrdmulh v27.4s, v27.4s, v16.4s\n"
- "sqrdmulh v8.4s, v8.4s, v16.4s\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v11.4s\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "and v16.16b, v24.16b, v18.16b\n"
+ "and v25.16b, v12.16b, v27.16b\n"
+ "and v2.16b, v5.16b, v27.16b\n"
+ "sqrdmulh v23.4s, v23.4s, v11.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v11.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v22.16b, v13.16b, v18.16b\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "and v23.16b, v7.16b, v21.16b\n"
- "sqrdmulh v14.4s, v14.4s, v18.4s\n"
- "and v20.16b, v27.16b, v21.16b\n"
- "sqrdmulh v22.4s, v22.4s, v18.4s\n"
- "and v31.16b, v8.16b, v21.16b\n"
- "sqrdmulh v17.4s, v17.4s, v18.4s\n"
- "sqadd v24.4s, v24.4s, v2.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v18.16b, v14.16b, v0.16b\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v11.16b, v22.16b, v0.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v10.16b, v17.16b, v0.16b\n"
- "sqadd v7.4s, v7.4s, v23.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v20.4s\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "sqadd v8.4s, v8.4s, v31.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v21.4s\n"
- "srshl v7.4s, v7.4s, v21.4s\n"
- "sqadd v14.4s, v14.4s, v18.4s\n"
- "srshl v27.4s, v27.4s, v21.4s\n"
- "sqadd v22.4s, v22.4s, v11.4s\n"
- "srshl v8.4s, v8.4s, v21.4s\n"
- "sqadd v17.4s, v17.4s, v10.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
- "sqxtn v13.4h, v13.4s\n"
- "srshl v14.4s, v14.4s, v0.4s\n"
+ "and v3.16b, v23.16b, v18.16b\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "and v17.16b, v7.16b, v27.16b\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v16.16b, v19.16b, v18.16b\n"
+ "sqadd v12.4s, v12.4s, v25.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v5.4s, v5.4s, v2.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "srshl v12.4s, v12.4s, v27.4s\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "sqadd v13.4s, v13.4s, v22.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqadd v23.4s, v23.4s, v3.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v18.4s\n"
+ "srshl v7.4s, v7.4s, v27.4s\n"
+ "sqxtn v30.4h, v30.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "sqxtn v12.4h, v12.4s\n"
+ "srshl v23.4s, v23.4s, v18.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v19.4s, v19.4s, v18.4s\n"
"sqxtn v7.4h, v7.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "sqxtn v27.4h, v27.4s\n"
- "srshl v17.4s, v17.4s, v0.4s\n"
- "sqxtn v8.4h, v8.4s\n"
- "sqxtn2 v13.8h, v24.4s\n"
- "sqxtn2 v7.8h, v14.4s\n"
- "sqxtn2 v27.8h, v22.4s\n"
- "sqxtn2 v8.8h, v17.4s\n"
- "sqadd v13.8h, v13.8h, v25.8h\n"
- "sqadd v7.8h, v7.8h, v25.8h\n"
- "sqadd v27.8h, v27.8h, v25.8h\n"
- "sqadd v8.8h, v8.8h, v25.8h\n"
- "smax v13.8h, v13.8h, v12.8h\n"
- "smax v7.8h, v7.8h, v12.8h\n"
- "smax v27.8h, v27.8h, v12.8h\n"
- "smax v8.8h, v8.8h, v12.8h\n"
- "smin v13.8h, v13.8h, v26.8h\n"
- "smin v7.8h, v7.8h, v26.8h\n"
- "smin v27.8h, v27.8h, v26.8h\n"
- "smin v8.8h, v8.8h, v26.8h\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "sqxtn2 v30.8h, v24.4s\n"
+ "sqxtn2 v12.8h, v13.4s\n"
+ "sqxtn2 v5.8h, v23.4s\n"
+ "sqxtn2 v7.8h, v19.4s\n"
+ "sqadd v30.8h, v30.8h, v15.8h\n"
+ "sqadd v12.8h, v12.8h, v15.8h\n"
+ "sqadd v5.8h, v5.8h, v15.8h\n"
+ "sqadd v7.8h, v7.8h, v15.8h\n"
+ "smax v30.8h, v30.8h, v31.8h\n"
+ "smax v12.8h, v12.8h, v31.8h\n"
+ "smax v5.8h, v5.8h, v31.8h\n"
+ "smax v7.8h, v7.8h, v31.8h\n"
+ "smin v30.8h, v30.8h, v28.8h\n"
+ "smin v12.8h, v12.8h, v28.8h\n"
+ "smin v5.8h, v5.8h, v28.8h\n"
+ "smin v7.8h, v7.8h, v28.8h\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"uzp1 v7.16b, v7.16b, v7.16b\n"
- "uzp1 v27.16b, v27.16b, v27.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "tbz x2, #2, 121f\n"
- "st1 { v13.s }[0], [x16], #0x4\n"
- "st1 { v7.s }[0], [x15], #0x4\n"
- "st1 { v27.s }[0], [x14], #0x4\n"
- "st1 { v8.s }[0], [x13], #0x4\n"
- "tbz x2, #1, 120f\n"
- "st1 { v13.h }[2], [x16], #0x2\n"
- "st1 { v7.h }[2], [x15], #0x2\n"
- "st1 { v27.h }[2], [x14], #0x2\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "tbz x2, #0, 123f\n"
- "st1 { v13.b }[6], [x16], #0x1\n"
- "st1 { v7.b }[6], [x15], #0x1\n"
- "st1 { v27.b }[6], [x14], #0x1\n"
- "st1 { v8.b }[6], [x13], #0x1\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v30.s }[0], [x17], #0x4\n"
+ "st1 { v12.s }[0], [x16], #0x4\n"
+ "st1 { v5.s }[0], [x15], #0x4\n"
+ "st1 { v7.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v30.h }[2], [x17], #0x2\n"
+ "st1 { v12.h }[2], [x16], #0x2\n"
+ "st1 { v5.h }[2], [x15], #0x2\n"
+ "st1 { v7.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v30.b }[6], [x17], #0x1\n"
+ "st1 { v12.b }[6], [x16], #0x1\n"
+ "st1 { v5.b }[6], [x15], #0x1\n"
+ "st1 { v7.b }[6], [x14], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x2, #0, 123f\n"
- "st1 { v13.b }[4], [x16], #0x1\n"
- "st1 { v7.b }[4], [x15], #0x1\n"
- "st1 { v27.b }[4], [x14], #0x1\n"
- "st1 { v8.b }[4], [x13], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v30.b }[4], [x17], #0x1\n"
+ "st1 { v12.b }[4], [x16], #0x1\n"
+ "st1 { v5.b }[4], [x15], #0x1\n"
+ "st1 { v7.b }[4], [x14], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x2, #1, 122f\n"
- "st1 { v13.h }[0], [x16], #0x2\n"
- "st1 { v7.h }[0], [x15], #0x2\n"
- "st1 { v27.h }[0], [x14], #0x2\n"
- "st1 { v8.h }[0], [x13], #0x2\n"
- "tbz x2, #0, 123f\n"
- "st1 { v13.b }[2], [x16], #0x1\n"
- "st1 { v7.b }[2], [x15], #0x1\n"
- "st1 { v27.b }[2], [x14], #0x1\n"
- "st1 { v8.b }[2], [x13], #0x1\n"
+ "tbz x1, #1, 122f\n"
+ "st1 { v30.h }[0], [x17], #0x2\n"
+ "st1 { v12.h }[0], [x16], #0x2\n"
+ "st1 { v5.h }[0], [x15], #0x2\n"
+ "st1 { v7.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v30.b }[2], [x17], #0x1\n"
+ "st1 { v12.b }[2], [x16], #0x1\n"
+ "st1 { v5.b }[2], [x15], #0x1\n"
+ "st1 { v7.b }[2], [x14], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x2, #0, 123f\n"
- "st1 { v13.b }[0], [x16], #0x1\n"
- "st1 { v7.b }[0], [x15], #0x1\n"
- "st1 { v27.b }[0], [x14], #0x1\n"
- "st1 { v8.b }[0], [x13], #0x1\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v30.b }[0], [x17], #0x1\n"
+ "st1 { v12.b }[0], [x16], #0x1\n"
+ "st1 { v5.b }[0], [x15], #0x1\n"
+ "st1 { v7.b }[0], [x14], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index f1c1b2315c..7e1e00abcc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,1072 +91,1072 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v14.16b }, [x20]\n"
- "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v19.16b }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
- "add x21, x23, %[offsetof_Requantize32_minval]\n"
- "add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v29.8h }, [x21]\n"
- "ld1r { v12.8h }, [x20]\n"
"mov x17, #0x0\n"
"mov x16, #0x0\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
"add x15, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
"ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "ssubl v23.8h, v23.8b, v19.8b\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "ssubl v16.8h, v16.8b, v19.8b\n"
- "ssubl v1.8h, v1.8b, v19.8b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "ssubl v5.8h, v5.8b, v19.8b\n"
- "ssubl v26.8h, v26.8b, v19.8b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "ssubl v18.8h, v18.8b, v19.8b\n"
- "ssubl v31.8h, v31.8b, v19.8b\n"
- "ldr d20, [x14, #0x40]\n"
+ "lsr x11, x8, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v11.16b }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v16.16b }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v15.8h, v15.8b, v16.8b\n"
+ "ssubl v4.8h, v4.8b, v16.8b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v25.8h, v25.8b, v19.8b\n"
- "ssubl v20.8h, v20.8b, v19.8b\n"
- "ldr q9, [x20, #0x0]\n"
- "ldr q24, [x20, #0x10]\n"
+ "ssubl v5.8h, v5.8b, v16.8b\n"
+ "ssubl v3.8h, v3.8b, v16.8b\n"
+ "ssubl v25.8h, v25.8b, v16.8b\n"
+ "ssubl v10.8h, v10.8b, v16.8b\n"
+ "ssubl v6.8h, v6.8b, v16.8b\n"
+ "ssubl v7.8h, v7.8b, v16.8b\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
"add x20, x20, #0x20\n"
+ "ssubl v9.8h, v9.8b, v16.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x23, x22, [x15, #0x0]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
"ldp x21, x20, [x15, #0x10]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d22, [x23, x17]\n"
- "ldr d4, [x22, x17]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d8, [x21, x17]\n"
- "ldr d27, [x20, x17]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d19, [x23, x17]\n"
+ "ldr d21, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ldr d22, [x20, x17]\n"
"ldr x20, [x15, #0x20]\n"
- "ldr d15, [x20, x17]\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "ldr d20, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q3, [x13, #0x0]\n"
- "ldr q17, [x12, #0x0]\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
- "ldr q21, [x13, #0x10]\n"
- "ldr q28, [x12, #0x10]\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "ldr x20, [x15, #0x28]\n"
- "ldr d11, [x20, x17]\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ldr x20, [x15, #0x38]\n"
- "ldr d4, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "ldr x20, [x15, #0x40]\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "ldr x27, [x15, #0x48]\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "ldr x26, [x15, #0x50]\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "ldr d8, [x20, x17]\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr q17, [x13, #0x0]\n"
+ "ldr q26, [x12, #0x0]\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "ldr q28, [x13, #0x10]\n"
+ "ldr q23, [x12, #0x10]\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "ldr x24, [x15, #0x28]\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "ldr x23, [x15, #0x38]\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "ldr x22, [x15, #0x30]\n"
+ "ldr x21, [x15, #0x40]\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
+ "ldr x26, [x15, #0x48]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr d21, [x24, x17]\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "ldr d29, [x21, x17]\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
"ldr x25, [x15, #0x58]\n"
"ldr x24, [x15, #0x60]\n"
- "smlal v2.4s, v11.4h, v31.4h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"ldr x23, [x15, #0x68]\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
"ldr x22, [x15, #0x70]\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
- "smlal v9.4s, v4.4h, v16.4h\n"
"ldr x21, [x15, #0x78]\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "ldr d22, [x26, x17]\n"
+ "smlal v0.4s, v21.4h, v6.4h\n"
+ "smlal2 v24.4s, v21.8h, v6.8h\n"
+ "ldr d21, [x20, x17]\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "ldr d27, [x27, x17]\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "ldr d11, [x26, x17]\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
"add x14, x14, #0x48\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal v10.4s, v22.4h, v20.4h\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
- "subs x8, x8, #0x1\n"
- "smlal2 v24.4s, v4.8h, v16.8h\n"
- "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "subs x11, x11, #0x1\n"
+ "smlal v31.4s, v19.4h, v9.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"add x13, x13, #0x20\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "ldr d20, [x25, x17]\n"
"add x12, x12, #0x20\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "ldr d15, [x25, x17]\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v20.8h\n"
- "ldr d22, [x24, x17]\n"
- "smlal v7.4s, v4.4h, v23.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "smlal v2.4s, v27.4h, v18.4h\n"
- "smlal v10.4s, v27.4h, v26.4h\n"
- "smlal2 v24.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v20.4h\n"
- "smlal2 v0.4s, v4.8h, v23.8h\n"
- "ldr d4, [x23, x17]\n"
- "smlal2 v30.4s, v27.8h, v18.8h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v26.8h\n"
- "ldr d26, [x22, x17]\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "usubl v26.8h, v26.8b, v14.8b\n"
- "smlal v2.4s, v11.4h, v23.4h\n"
- "smlal v10.4s, v15.4h, v1.4h\n"
- "smlal2 v24.4s, v27.8h, v20.8h\n"
- "smlal v9.4s, v11.4h, v5.4h\n"
- "smlal2 v0.4s, v8.8h, v16.8h\n"
- "ldr d8, [x21, x17]\n"
- "smlal2 v30.4s, v11.8h, v23.8h\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v15.8h, v1.8h\n"
- "smlal v7.4s, v27.4h, v25.4h\n"
+ "smlal v2.4s, v18.4h, v4.4h\n"
+ "smlal2 v1.4s, v18.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v9.8h\n"
+ "ldr d19, [x24, x17]\n"
+ "smlal v8.4s, v18.4h, v15.4h\n"
+ "smlal v31.4s, v22.4h, v25.4h\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
+ "smlal2 v30.4s, v18.8h, v15.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v0.4s, v22.4h, v10.4h\n"
+ "smlal2 v24.4s, v22.8h, v10.8h\n"
+ "smlal v2.4s, v29.4h, v5.4h\n"
+ "smlal2 v1.4s, v29.8h, v5.8h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal2 v27.4s, v22.8h, v25.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v8.4s, v29.4h, v4.4h\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal v31.4s, v20.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d4, [x21, x17]\n"
"add x17, x17, #0x8\n"
- "smlal v2.4s, v22.4h, v5.4h\n"
- "smlal v10.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "smlal v9.4s, v22.4h, v31.4h\n"
- "sqrdmulh v9.4s, v9.4s, v3.4s\n"
- "smlal2 v0.4s, v27.8h, v25.8h\n"
- "smlal2 v30.4s, v22.8h, v5.8h\n"
- "and v27.16b, v9.16b, v17.16b\n"
- "smlal2 v6.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v15.4h, v18.4h\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smlal v2.4s, v26.4h, v25.4h\n"
- "smlal v10.4s, v26.4h, v31.4h\n"
- "sqadd v9.4s, v9.4s, v27.4s\n"
- "smlal2 v24.4s, v22.8h, v31.8h\n"
- "smlal2 v0.4s, v15.8h, v18.8h\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- "smlal2 v30.4s, v26.8h, v25.8h\n"
- "smlal2 v6.4s, v26.8h, v31.8h\n"
- "and v31.16b, v24.16b, v28.16b\n"
- "smlal v7.4s, v4.4h, v20.4h\n"
- "smlal v2.4s, v8.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v3.4s\n"
- "smlal v10.4s, v8.4h, v25.4h\n"
- "smlal2 v0.4s, v4.8h, v20.8h\n"
- "sqrdmulh v2.4s, v2.4s, v3.4s\n"
- "smlal2 v30.4s, v8.8h, v20.8h\n"
- "smlal2 v6.4s, v8.8h, v25.8h\n"
- "sqrdmulh v10.4s, v10.4s, v3.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v22.16b, v7.16b, v17.16b\n"
- "sqrdmulh v0.4s, v0.4s, v21.4s\n"
- "and v3.16b, v2.16b, v17.16b\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "and v11.16b, v10.16b, v17.16b\n"
- "sqrdmulh v6.4s, v6.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v20.16b, v0.16b, v28.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v31.16b, v30.16b, v28.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v18.16b, v6.16b, v28.16b\n"
- "sqadd v7.4s, v7.4s, v22.4s\n"
+ "smlal v0.4s, v21.4h, v15.4h\n"
+ "smlal2 v24.4s, v21.8h, v15.8h\n"
+ "smlal v2.4s, v22.4h, v9.4h\n"
+ "smlal2 v1.4s, v22.8h, v9.8h\n"
+ "usubl v25.8h, v25.8b, v11.8b\n"
+ "smlal2 v27.4s, v20.8h, v5.8h\n"
+ "smlal v8.4s, v22.4h, v7.4h\n"
+ "usubl v4.8h, v4.8b, v11.8b\n"
+ "smlal v31.4s, v18.4h, v10.4h\n"
+ "smlal2 v30.4s, v22.8h, v7.8h\n"
+ "smlal v0.4s, v19.4h, v3.4h\n"
+ "smlal2 v24.4s, v19.8h, v3.8h\n"
+ "smlal v2.4s, v21.4h, v3.4h\n"
+ "smlal2 v1.4s, v21.8h, v3.8h\n"
+ "smlal2 v27.4s, v18.8h, v10.8h\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal v31.4s, v25.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v10.8h\n"
+ "smlal v0.4s, v25.4h, v7.4h\n"
+ "smlal2 v24.4s, v25.8h, v7.8h\n"
+ "smlal v2.4s, v19.4h, v6.4h\n"
+ "smlal2 v1.4s, v19.8h, v6.8h\n"
+ "smlal2 v27.4s, v25.8h, v6.8h\n"
+ "smlal v8.4s, v18.4h, v9.4h\n"
+ "smlal v31.4s, v4.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v9.8h\n"
+ "smlal v0.4s, v4.4h, v9.4h\n"
+ "smlal2 v24.4s, v4.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v28.4s\n"
+ "smlal2 v27.4s, v4.8h, v7.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "and v18.16b, v2.16b, v26.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v28.4s\n"
+ "and v4.16b, v1.16b, v23.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v21.16b, v8.16b, v26.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v26.16b\n"
+ "sqadd v2.4s, v2.4s, v18.4s\n"
+ "and v19.16b, v31.16b, v26.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v30.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v4.4s\n"
"sshr v20.4s, v20.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v3.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v11.4s\n"
+ "and v17.16b, v24.16b, v23.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v3.16b, v27.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
"sqadd v0.4s, v0.4s, v20.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "sqadd v30.4s, v30.4s, v31.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "sqadd v6.4s, v6.4s, v18.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v28.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v28.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "srshl v8.4s, v8.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "srshl v0.4s, v0.4s, v26.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v27.4s, v27.4s, v3.4s\n"
+ "srshl v1.4s, v1.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v28.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d9, [x11, x16]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v23.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v23.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v23.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str d7, [x10, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d2, [x9, x16]\n"
- "str d10, [x28, x16]\n"
- "ldr q9, [x20, #0x0]\n"
- "ldr q24, [x20, #0x10]\n"
- "add x20, x20, #0x20\n"
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str d2, [x10, x16]\n"
+ "str d8, [x9, x16]\n"
+ "str d0, [x28, x16]\n"
+ "str d31, [x27, x16]\n"
"add x16, x16, #0x8\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
- "ldr d20, [x14, #0x40]\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldp x23, x22, [x15, #0x0]\n"
- "ssubl v23.8h, v23.8b, v19.8b\n"
- "ssubl v16.8h, v16.8b, v19.8b\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ssubl v15.8h, v15.8b, v16.8b\n"
+ "ssubl v4.8h, v4.8b, v16.8b\n"
+ "ssubl v5.8h, v5.8b, v16.8b\n"
+ "ssubl v3.8h, v3.8b, v16.8b\n"
"ldp x21, x20, [x15, #0x10]\n"
- "ldr d22, [x23, x17]\n"
- "ssubl v1.8h, v1.8b, v19.8b\n"
- "ssubl v5.8h, v5.8b, v19.8b\n"
- "ldr d4, [x22, x17]\n"
- "ldr d8, [x21, x17]\n"
- "ssubl v26.8h, v26.8b, v19.8b\n"
- "ssubl v18.8h, v18.8b, v19.8b\n"
- "ldr d27, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v16.8b\n"
+ "ssubl v10.8h, v10.8b, v16.8b\n"
+ "ssubl v6.8h, v6.8b, v16.8b\n"
+ "ssubl v7.8h, v7.8b, v16.8b\n"
+ "ldr d19, [x23, x17]\n"
+ "ldr d21, [x22, x17]\n"
+ "ldr d29, [x21, x17]\n"
+ "ldr d22, [x20, x17]\n"
+ "ssubl v9.8h, v9.8b, v16.8b\n"
"ldr x20, [x15, #0x20]\n"
- "ssubl v31.8h, v31.8b, v19.8b\n"
- "ssubl v25.8h, v25.8b, v19.8b\n"
- "ldr d15, [x20, x17]\n"
- "ssubl v20.8h, v20.8b, v19.8b\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "ldr d20, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q28, [x13, #0x0]\n"
- "ldr q17, [x12, #0x0]\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
- "ldr q21, [x13, #0x10]\n"
- "ldr q3, [x12, #0x10]\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "ldr x20, [x15, #0x28]\n"
- "ldr d11, [x20, x17]\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "ldr x20, [x15, #0x38]\n"
- "ldr d4, [x20, x17]\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
- "ldr x20, [x15, #0x30]\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr q26, [x13, #0x0]\n"
+ "ldr q28, [x12, #0x0]\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "ldr q17, [x13, #0x10]\n"
+ "ldr q23, [x12, #0x10]\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "ldr x23, [x15, #0x28]\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "ldr x22, [x15, #0x38]\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "ldr x21, [x15, #0x30]\n"
"ldr x20, [x15, #0x40]\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
"ldr x26, [x15, #0x48]\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
"ldr x25, [x15, #0x50]\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "ldr d8, [x20, x17]\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr d21, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "ldr d29, [x20, x17]\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
"ldr x24, [x15, #0x58]\n"
"ldr x23, [x15, #0x60]\n"
- "smlal v2.4s, v11.4h, v31.4h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"ldr x22, [x15, #0x68]\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
"ldr x21, [x15, #0x70]\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
- "smlal v9.4s, v4.4h, v16.4h\n"
"ldr x20, [x15, #0x78]\n"
- "tst x7, #0x7\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "ldr d27, [x26, x17]\n"
- "smlal2 v30.4s, v11.8h, v31.8h\n"
- "ldr d11, [x25, x17]\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "ldr d22, [x26, x17]\n"
+ "smlal v0.4s, v21.4h, v6.4h\n"
+ "smlal2 v24.4s, v21.8h, v6.8h\n"
+ "ldr d21, [x25, x17]\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
+ "tst x8, #0x7\n"
"add x13, x13, #0x20\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal v10.4s, v22.4h, v20.4h\n"
- "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
"add x12, x12, #0x20\n"
- "smlal2 v24.4s, v4.8h, v16.8h\n"
- "smlal v9.4s, v8.4h, v1.4h\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "ldr d15, [x24, x17]\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
- "smlal2 v6.4s, v22.8h, v20.8h\n"
- "ldr d22, [x23, x17]\n"
- "smlal v7.4s, v4.4h, v23.4h\n"
- "usubl v22.8h, v22.8b, v14.8b\n"
- "smlal v2.4s, v27.4h, v18.4h\n"
- "smlal v10.4s, v27.4h, v26.4h\n"
- "smlal2 v24.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v20.4h\n"
- "smlal2 v0.4s, v4.8h, v23.8h\n"
- "ldr d4, [x22, x17]\n"
- "smlal2 v30.4s, v27.8h, v18.8h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "smlal2 v6.4s, v27.8h, v26.8h\n"
- "ldr d26, [x21, x17]\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "usubl v26.8h, v26.8b, v14.8b\n"
- "smlal v2.4s, v11.4h, v23.4h\n"
- "smlal v10.4s, v15.4h, v1.4h\n"
- "smlal2 v24.4s, v27.8h, v20.8h\n"
- "smlal v9.4s, v11.4h, v5.4h\n"
- "smlal2 v0.4s, v8.8h, v16.8h\n"
- "ldr d16, [x20, x17]\n"
- "smlal2 v30.4s, v11.8h, v23.8h\n"
- "usubl v16.8h, v16.8b, v14.8b\n"
- "smlal2 v6.4s, v15.8h, v1.8h\n"
- "smlal v7.4s, v27.4h, v25.4h\n"
+ "smlal v31.4s, v18.4h, v9.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "ldr d20, [x24, x17]\n"
+ "smlal v2.4s, v19.4h, v4.4h\n"
+ "smlal2 v1.4s, v19.8h, v4.8h\n"
+ "smlal2 v27.4s, v18.8h, v9.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal v8.4s, v19.4h, v15.4h\n"
+ "smlal v31.4s, v22.4h, v25.4h\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
+ "smlal2 v30.4s, v19.8h, v15.8h\n"
+ "ldr d19, [x22, x17]\n"
+ "smlal v0.4s, v22.4h, v10.4h\n"
+ "smlal2 v24.4s, v22.8h, v10.8h\n"
+ "smlal v2.4s, v29.4h, v5.4h\n"
+ "smlal2 v1.4s, v29.8h, v5.8h\n"
+ "usubl v18.8h, v18.8b, v11.8b\n"
+ "smlal2 v27.4s, v22.8h, v25.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "smlal v8.4s, v29.4h, v4.4h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "smlal v31.4s, v20.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x20, x17]\n"
"add x17, x17, #0x8\n"
- "smlal v2.4s, v22.4h, v5.4h\n"
- "smlal v10.4s, v4.4h, v18.4h\n"
- "smlal2 v24.4s, v11.8h, v5.8h\n"
- "smlal v9.4s, v22.4h, v31.4h\n"
- "sqrdmulh v9.4s, v9.4s, v28.4s\n"
- "smlal2 v0.4s, v27.8h, v25.8h\n"
- "smlal2 v30.4s, v22.8h, v5.8h\n"
- "and v1.16b, v9.16b, v17.16b\n"
- "smlal2 v6.4s, v4.8h, v18.8h\n"
- "smlal v7.4s, v15.4h, v18.4h\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smlal v2.4s, v26.4h, v25.4h\n"
- "smlal v10.4s, v26.4h, v31.4h\n"
- "sqadd v9.4s, v9.4s, v1.4s\n"
- "smlal2 v24.4s, v22.8h, v31.8h\n"
- "smlal2 v0.4s, v15.8h, v18.8h\n"
- "sqrdmulh v24.4s, v24.4s, v21.4s\n"
- "smlal2 v30.4s, v26.8h, v25.8h\n"
- "smlal2 v6.4s, v26.8h, v31.8h\n"
- "and v31.16b, v24.16b, v3.16b\n"
- "smlal v7.4s, v4.4h, v20.4h\n"
- "smlal v2.4s, v16.4h, v20.4h\n"
- "sqrdmulh v7.4s, v7.4s, v28.4s\n"
- "smlal v10.4s, v16.4h, v25.4h\n"
- "smlal2 v0.4s, v4.8h, v20.8h\n"
- "sqrdmulh v2.4s, v2.4s, v28.4s\n"
- "smlal2 v30.4s, v16.8h, v20.8h\n"
- "smlal2 v6.4s, v16.8h, v25.8h\n"
- "sqrdmulh v10.4s, v10.4s, v28.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v22.16b, v7.16b, v17.16b\n"
- "sqrdmulh v0.4s, v0.4s, v21.4s\n"
- "and v15.16b, v2.16b, v17.16b\n"
- "sqrdmulh v30.4s, v30.4s, v21.4s\n"
- "and v11.16b, v10.16b, v17.16b\n"
- "sqrdmulh v6.4s, v6.4s, v21.4s\n"
- "sqadd v24.4s, v24.4s, v31.4s\n"
+ "smlal v0.4s, v21.4h, v15.4h\n"
+ "smlal2 v24.4s, v21.8h, v15.8h\n"
+ "smlal v2.4s, v22.4h, v9.4h\n"
+ "smlal2 v1.4s, v22.8h, v9.8h\n"
+ "usubl v25.8h, v25.8b, v11.8b\n"
+ "smlal2 v27.4s, v20.8h, v5.8h\n"
+ "smlal v8.4s, v22.4h, v7.4h\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "smlal v31.4s, v19.4h, v10.4h\n"
+ "smlal2 v30.4s, v22.8h, v7.8h\n"
+ "smlal v0.4s, v18.4h, v3.4h\n"
+ "smlal2 v24.4s, v18.8h, v3.8h\n"
+ "smlal v2.4s, v21.4h, v3.4h\n"
+ "smlal2 v1.4s, v21.8h, v3.8h\n"
+ "smlal2 v27.4s, v19.8h, v10.8h\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal v31.4s, v25.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v10.8h\n"
+ "smlal v0.4s, v25.4h, v7.4h\n"
+ "smlal2 v24.4s, v25.8h, v7.8h\n"
+ "smlal v2.4s, v18.4h, v6.4h\n"
+ "smlal2 v1.4s, v18.8h, v6.8h\n"
+ "smlal2 v27.4s, v25.8h, v6.8h\n"
+ "smlal v8.4s, v19.4h, v9.4h\n"
+ "smlal v31.4s, v29.4h, v7.4h\n"
+ "smlal2 v30.4s, v19.8h, v9.8h\n"
+ "smlal v0.4s, v29.4h, v9.4h\n"
+ "smlal2 v24.4s, v29.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v26.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v17.4s\n"
+ "smlal2 v27.4s, v29.8h, v7.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v26.4s\n"
+ "and v25.16b, v2.16b, v28.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v26.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "and v22.16b, v1.16b, v23.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v21.16b, v8.16b, v28.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v17.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v17.4s\n"
"sshr v22.4s, v22.4s, #0x1f\n"
- "and v18.16b, v0.16b, v3.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "and v23.16b, v30.16b, v3.16b\n"
- "sshr v11.4s, v11.4s, #0x1f\n"
- "and v21.16b, v6.16b, v3.16b\n"
- "sqadd v7.4s, v7.4s, v22.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v15.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v11.4s\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sqadd v2.4s, v2.4s, v25.4s\n"
+ "and v19.16b, v31.16b, v28.16b\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "sqadd v30.4s, v30.4s, v23.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "sqadd v6.4s, v6.4s, v21.4s\n"
- "srshl v24.4s, v24.4s, v3.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
+ "and v10.16b, v30.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v17.16b, v24.16b, v23.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v22.16b, v27.16b, v23.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v19.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v28.4s\n"
+ "srshl v8.4s, v8.4s, v28.4s\n"
+ "sqadd v30.4s, v30.4s, v10.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v28.4s\n"
+ "sqadd v27.4s, v27.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v3.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "str d9, [x11, x16]\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v23.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v23.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v23.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "str d7, [x10, x16]\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d2, [x9, x16]\n"
- "str d10, [x28, x16]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str d2, [x10, x16]\n"
+ "str d8, [x9, x16]\n"
+ "str d0, [x28, x16]\n"
+ "str d31, [x27, x16]\n"
"add x16, x16, #0x8\n"
"beq 64f\n"
"add x14, x14, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v9.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v24.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v24.s }[2], [x20]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v2.4s }, [x20], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v24.s }[0], [x20]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v9.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v9.s }[2], [x20]\n"
+ "tbz x8, #1, 6f\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v2.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v9.s }[0], [x20]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v2.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d23, [x14, #0x0]\n"
- "ldr d16, [x14, #0x8]\n"
- "mov v7.16b, v9.16b\n"
- "mov v0.16b, v24.16b\n"
- "ldr d1, [x14, #0x10]\n"
- "ldr d5, [x14, #0x18]\n"
- "mov v2.16b, v9.16b\n"
- "mov v30.16b, v24.16b\n"
- "ldr d26, [x14, #0x20]\n"
- "ldr d18, [x14, #0x28]\n"
- "mov v10.16b, v9.16b\n"
- "mov v6.16b, v24.16b\n"
- "ldr d31, [x14, #0x30]\n"
- "ldr d25, [x14, #0x38]\n"
- "ssubl v23.8h, v23.8b, v19.8b\n"
- "ssubl v16.8h, v16.8b, v19.8b\n"
- "ldr d20, [x14, #0x40]\n"
+ "ldr d15, [x14, #0x0]\n"
+ "ldr d4, [x14, #0x8]\n"
+ "mov v8.16b, v2.16b\n"
+ "mov v30.16b, v1.16b\n"
+ "ldr d5, [x14, #0x10]\n"
+ "ldr d3, [x14, #0x18]\n"
+ "mov v0.16b, v2.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d25, [x14, #0x20]\n"
+ "ldr d10, [x14, #0x28]\n"
+ "mov v31.16b, v2.16b\n"
+ "mov v27.16b, v1.16b\n"
+ "ldr d6, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v15.8h, v15.8b, v16.8b\n"
+ "ssubl v4.8h, v4.8b, v16.8b\n"
+ "ldr d9, [x14, #0x40]\n"
"ldp x24, x23, [x15, #0x0]\n"
- "ssubl v1.8h, v1.8b, v19.8b\n"
- "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v16.8b\n"
+ "ssubl v3.8h, v3.8b, v16.8b\n"
+ "ssubl v25.8h, v25.8b, v16.8b\n"
+ "ssubl v10.8h, v10.8b, v16.8b\n"
+ "ssubl v6.8h, v6.8b, v16.8b\n"
+ "ssubl v7.8h, v7.8b, v16.8b\n"
"ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
- "ssubl v26.8h, v26.8b, v19.8b\n"
- "ssubl v18.8h, v18.8b, v19.8b\n"
- "ssubl v31.8h, v31.8b, v19.8b\n"
- "ssubl v25.8h, v25.8b, v19.8b\n"
- "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v9.8h, v9.8b, v16.8b\n"
"add x24, x24, x17\n"
"add x23, x23, x17\n"
+ "ldr x20, [x15, #0x20]\n"
"add x22, x22, x17\n"
"add x21, x21, x17\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v22.s }[0], [x24], #0x4\n"
- "ld1 { v4.s }[0], [x23], #0x4\n"
- "ld1 { v8.s }[0], [x22], #0x4\n"
- "ld1 { v27.s }[0], [x21], #0x4\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v22.h }[2], [x24], #0x2\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v8.h }[2], [x22], #0x2\n"
- "ld1 { v27.h }[2], [x21], #0x2\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[6], [x24]\n"
- "ld1 { v4.b }[6], [x23]\n"
- "ld1 { v8.b }[6], [x22]\n"
- "ld1 { v27.b }[6], [x21]\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v19.s }[0], [x24], #0x4\n"
+ "ld1 { v21.s }[0], [x23], #0x4\n"
+ "ld1 { v29.s }[0], [x22], #0x4\n"
+ "ld1 { v22.s }[0], [x21], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v21.h }[2], [x23], #0x2\n"
+ "ld1 { v29.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v21.b }[6], [x23]\n"
+ "ld1 { v29.b }[6], [x22]\n"
+ "ld1 { v22.b }[6], [x21]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[4], [x24]\n"
- "ld1 { v4.b }[4], [x23]\n"
- "ld1 { v8.b }[4], [x22]\n"
- "ld1 { v27.b }[4], [x21]\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v21.b }[4], [x23]\n"
+ "ld1 { v29.b }[4], [x22]\n"
+ "ld1 { v22.b }[4], [x21]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v22.h }[0], [x24], #0x2\n"
- "ld1 { v4.h }[0], [x23], #0x2\n"
- "ld1 { v8.h }[0], [x22], #0x2\n"
- "ld1 { v27.h }[0], [x21], #0x2\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x23]\n"
- "ld1 { v8.b }[2], [x22]\n"
- "ld1 { v27.b }[2], [x21]\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x8, #1, 10f\n"
+ "ld1 { v19.h }[0], [x24], #0x2\n"
+ "ld1 { v21.h }[0], [x23], #0x2\n"
+ "ld1 { v29.h }[0], [x22], #0x2\n"
+ "ld1 { v22.h }[0], [x21], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v21.b }[2], [x23]\n"
+ "ld1 { v29.b }[2], [x22]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v22.b }[0], [x24]\n"
- "ld1 { v4.b }[0], [x23]\n"
- "ld1 { v8.b }[0], [x22]\n"
- "ld1 { v27.b }[0], [x21]\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v19.b }[0], [x24]\n"
+ "ld1 { v21.b }[0], [x23]\n"
+ "ld1 { v29.b }[0], [x22]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v22.8h, v22.8b, v14.8b\n"
- "smlal v9.4s, v22.4h, v26.4h\n"
- "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "usubl v19.8h, v19.8b, v11.8b\n"
+ "usubl v21.8h, v21.8b, v11.8b\n"
"ldr x20, [x15, #0x28]\n"
- "smlal v7.4s, v22.4h, v5.4h\n"
- "smlal2 v0.4s, v22.8h, v5.8h\n"
- "usubl v4.8h, v4.8b, v14.8b\n"
- "usubl v8.8h, v8.8b, v14.8b\n"
- "smlal v2.4s, v22.4h, v16.4h\n"
- "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "usubl v29.8h, v29.8b, v11.8b\n"
+ "usubl v22.8h, v22.8b, v11.8b\n"
+ "usubl v20.8h, v20.8b, v11.8b\n"
+ "smlal v2.4s, v19.4h, v25.4h\n"
+ "smlal2 v1.4s, v19.8h, v25.8h\n"
+ "smlal v8.4s, v19.4h, v3.4h\n"
+ "smlal2 v30.4s, v19.8h, v3.8h\n"
"add x20, x20, x17\n"
- "smlal v10.4s, v22.4h, v23.4h\n"
- "smlal2 v6.4s, v22.8h, v23.8h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "smlal v9.4s, v4.4h, v23.4h\n"
- "smlal2 v24.4s, v4.8h, v23.8h\n"
- "usubl v15.8h, v15.8b, v14.8b\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "smlal2 v0.4s, v8.8h, v1.8h\n"
- "smlal v9.4s, v27.4h, v18.4h\n"
- "smlal2 v24.4s, v27.8h, v18.8h\n"
- "smlal v7.4s, v27.4h, v26.4h\n"
- "smlal2 v0.4s, v27.8h, v26.8h\n"
- "smlal v2.4s, v27.4h, v1.4h\n"
- "smlal2 v30.4s, v27.8h, v1.8h\n"
- "smlal v10.4s, v27.4h, v16.4h\n"
- "smlal2 v6.4s, v27.8h, v16.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "smlal v0.4s, v19.4h, v4.4h\n"
+ "smlal2 v24.4s, v19.8h, v4.8h\n"
+ "smlal v31.4s, v19.4h, v15.4h\n"
+ "smlal2 v27.4s, v19.8h, v15.8h\n"
+ "smlal v2.4s, v21.4h, v15.4h\n"
+ "smlal2 v1.4s, v21.8h, v15.8h\n"
+ "smlal v8.4s, v29.4h, v5.4h\n"
+ "smlal2 v30.4s, v29.8h, v5.8h\n"
+ "smlal v0.4s, v22.4h, v5.4h\n"
+ "smlal2 v24.4s, v22.8h, v5.8h\n"
+ "smlal v31.4s, v22.4h, v4.4h\n"
+ "smlal2 v27.4s, v22.8h, v4.8h\n"
+ "smlal v2.4s, v22.4h, v10.4h\n"
+ "smlal2 v1.4s, v22.8h, v10.8h\n"
+ "smlal v8.4s, v22.4h, v25.4h\n"
+ "smlal2 v30.4s, v22.8h, v25.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x8, #1, 14f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v21.8h, v21.8b, v14.8b\n"
- "smlal v2.4s, v21.4h, v31.4h\n"
- "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x30]\n"
- "smlal v9.4s, v15.4h, v25.4h\n"
- "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v2.4s, v20.4h, v7.4h\n"
+ "smlal2 v1.4s, v20.8h, v7.8h\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v30.4s, v20.8h, v6.8h\n"
+ "smlal v31.4s, v20.4h, v3.4h\n"
+ "smlal2 v27.4s, v20.8h, v3.8h\n"
+ "smlal v0.4s, v17.4h, v6.4h\n"
+ "smlal2 v24.4s, v17.8h, v6.8h\n"
"add x20, x20, x17\n"
- "smlal v7.4s, v15.4h, v31.4h\n"
- "smlal2 v0.4s, v15.8h, v31.8h\n"
- "smlal v2.4s, v15.4h, v26.4h\n"
- "smlal2 v30.4s, v15.8h, v26.8h\n"
- "smlal v10.4s, v15.4h, v5.4h\n"
- "smlal2 v6.4s, v15.8h, v5.8h\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "smlal v0.4s, v20.4h, v25.4h\n"
+ "smlal2 v24.4s, v20.8h, v25.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x8, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x38]\n"
- "smlal v10.4s, v28.4h, v20.4h\n"
- "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "smlal v31.4s, v16.4h, v9.4h\n"
+ "smlal2 v27.4s, v16.8h, v9.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v22.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[4], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 1): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v22.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "tbz x8, #1, 22f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v22.b }[0], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x40]\n"
- "smlal v9.4s, v22.4h, v16.4h\n"
- "smlal2 v24.4s, v22.8h, v16.8h\n"
- "smlal v7.4s, v22.4h, v23.4h\n"
- "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "smlal v2.4s, v17.4h, v4.4h\n"
+ "smlal2 v1.4s, v17.8h, v4.8h\n"
+ "smlal v8.4s, v17.4h, v15.4h\n"
+ "smlal2 v30.4s, v17.8h, v15.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (0, 2): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "tbz x8, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v21.8h, v21.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x48]\n"
- "smlal v9.4s, v21.4h, v1.4h\n"
- "smlal2 v24.4s, v21.8h, v1.8h\n"
- "smlal v7.4s, v21.4h, v16.4h\n"
- "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "smlal v2.4s, v16.4h, v5.4h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal2 v30.4s, v16.8h, v4.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "tbz x8, #1, 30f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x50]\n"
- "smlal v9.4s, v28.4h, v20.4h\n"
- "smlal2 v24.4s, v28.8h, v20.8h\n"
- "smlal v7.4s, v28.4h, v25.4h\n"
- "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "smlal v2.4s, v17.4h, v9.4h\n"
+ "smlal2 v1.4s, v17.8h, v9.8h\n"
+ "smlal v8.4s, v17.4h, v7.4h\n"
+ "smlal2 v30.4s, v17.8h, v7.8h\n"
+ "smlal v0.4s, v17.4h, v10.4h\n"
+ "smlal2 v24.4s, v17.8h, v10.8h\n"
+ "smlal v31.4s, v17.4h, v25.4h\n"
"add x20, x20, x17\n"
- "smlal v2.4s, v28.4h, v18.4h\n"
- "smlal2 v30.4s, v28.8h, v18.8h\n"
- "smlal v10.4s, v28.4h, v26.4h\n"
- "smlal2 v6.4s, v28.8h, v26.8h\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "smlal2 v27.4s, v17.8h, v25.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (1, 0): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x8, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x58]\n"
- "smlal v9.4s, v8.4h, v5.4h\n"
- "smlal2 v24.4s, v8.8h, v5.8h\n"
- "smlal v2.4s, v8.4h, v23.4h\n"
- "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "smlal v2.4s, v16.4h, v3.4h\n"
+ "smlal2 v1.4s, v16.8h, v3.8h\n"
+ "smlal v0.4s, v16.4h, v15.4h\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "tbz x8, #1, 38f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x60]\n"
- "smlal v7.4s, v8.4h, v18.4h\n"
- "smlal2 v0.4s, v8.8h, v18.8h\n"
- "smlal v10.4s, v8.4h, v1.4h\n"
- "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "smlal v8.4s, v17.4h, v10.4h\n"
+ "smlal2 v30.4s, v17.8h, v10.8h\n"
+ "smlal v31.4s, v17.4h, v5.4h\n"
+ "smlal2 v27.4s, v17.8h, v5.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
- "ld1 { v17.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
- "ld1 { v17.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[6], [x20]\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[4], [x20]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 42f\n"
- "ld1 { v17.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[2], [x20]\n"
+ "tbz x8, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
- "ld1 { v17.b }[0], [x20]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v17.8h, v17.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x68]\n"
- "smlal v9.4s, v17.4h, v31.4h\n"
- "smlal2 v24.4s, v17.8h, v31.8h\n"
- "smlal v2.4s, v17.4h, v5.4h\n"
- "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "smlal v2.4s, v16.4h, v6.4h\n"
+ "smlal2 v1.4s, v16.8h, v6.8h\n"
+ "smlal v0.4s, v16.4h, v3.4h\n"
+ "smlal2 v24.4s, v16.8h, v3.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x8, #1, 46f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v23.8h, v23.8b, v14.8b\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
"ldr x20, [x15, #0x70]\n"
- "smlal v7.4s, v23.4h, v20.4h\n"
- "smlal2 v0.4s, v23.8h, v20.8h\n"
- "smlal v10.4s, v23.4h, v18.4h\n"
- "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v30.4s, v17.8h, v9.8h\n"
+ "smlal v31.4s, v17.4h, v10.4h\n"
+ "smlal2 v27.4s, v17.8h, v10.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
- "ld1 { v5.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
- "ld1 { v5.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[6], [x20]\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[4], [x20]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 50f\n"
- "ld1 { v5.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[2], [x20]\n"
+ "tbz x8, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
- "ld1 { v5.b }[0], [x20]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v5.8h, v5.8b, v14.8b\n"
+ "usubl v16.8h, v16.8b, v11.8b\n"
"ldr x20, [x15, #0x78]\n"
- "smlal v2.4s, v5.4h, v25.4h\n"
- "smlal2 v30.4s, v5.8h, v25.8h\n"
- "smlal v10.4s, v5.4h, v31.4h\n"
- "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "smlal v0.4s, v16.4h, v7.4h\n"
+ "smlal2 v24.4s, v16.8h, v7.8h\n"
+ "smlal v31.4s, v16.4h, v6.4h\n"
+ "smlal2 v27.4s, v16.8h, v6.8h\n"
"add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x8, #1, 54f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v23.8h, v23.8b, v14.8b\n"
- "smlal v2.4s, v23.4h, v20.4h\n"
- "smlal2 v30.4s, v23.8h, v20.8h\n"
- "smlal v10.4s, v23.4h, v25.4h\n"
- "smlal2 v6.4s, v23.8h, v25.8h\n"
- "tbz x7, #2, 57f\n"
- "ld1 { v15.4s }, [x13], #0x10\n"
- "ld1 { v19.4s }, [x12], #0x10\n"
- "tbz x7, #1, 56f\n"
+ "usubl v17.8h, v17.8b, v11.8b\n"
+ "smlal v0.4s, v17.4h, v9.4h\n"
+ "smlal2 v24.4s, v17.8h, v9.8h\n"
+ "smlal v31.4s, v17.4h, v7.4h\n"
+ "smlal2 v27.4s, v17.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v16.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
+ "tbz x8, #1, 56f\n"
"ld1 { v18.d }[0], [x13], #0x8\n"
"ld1 { v22.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v18.s }[2], [x13]\n"
"ld1 { v22.s }[2], [x12]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v18.s }[0], [x13]\n"
"ld1 { v22.s }[0], [x12]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 58f\n"
- "ld1 { v15.d }[0], [x13], #0x8\n"
- "ld1 { v19.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 59f\n"
- "ld1 { v15.s }[2], [x13]\n"
- "ld1 { v19.s }[2], [x12]\n"
+ "tbz x8, #1, 58f\n"
+ "ld1 { v16.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
- "ld1 { v15.s }[0], [x13]\n"
- "ld1 { v19.s }[0], [x12]\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v16.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v9.4s, v9.4s, v15.4s\n"
- "and v17.16b, v9.16b, v19.16b\n"
- "add x11, x11, x16\n"
+ "sqrdmulh v2.4s, v2.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
"add x10, x10, x16\n"
- "sqrdmulh v24.4s, v24.4s, v18.4s\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
"add x9, x9, x16\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v16.4s\n"
"add x28, x28, x16\n"
- "and v20.16b, v24.16b, v22.16b\n"
- "sqrdmulh v7.4s, v7.4s, v15.4s\n"
- "sqrdmulh v2.4s, v2.4s, v15.4s\n"
- "sqrdmulh v10.4s, v10.4s, v15.4s\n"
- "sqadd v9.4s, v9.4s, v17.4s\n"
- "sshr v20.4s, v20.4s, #0x1f\n"
- "and v21.16b, v7.16b, v19.16b\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "and v15.16b, v2.16b, v19.16b\n"
+ "add x27, x27, x16\n"
+ "sqrdmulh v31.4s, v31.4s, v16.4s\n"
"sqrdmulh v30.4s, v30.4s, v18.4s\n"
- "and v23.16b, v10.16b, v19.16b\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqadd v24.4s, v24.4s, v20.4s\n"
+ "and v17.16b, v2.16b, v23.16b\n"
+ "and v16.16b, v1.16b, v22.16b\n"
+ "and v21.16b, v8.16b, v23.16b\n"
+ "and v20.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sshr v21.4s, v21.4s, #0x1f\n"
- "and v18.16b, v0.16b, v22.16b\n"
- "sshr v15.4s, v15.4s, #0x1f\n"
- "and v17.16b, v30.16b, v22.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v28.16b, v6.16b, v22.16b\n"
- "sqadd v7.4s, v7.4s, v21.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v2.4s, v2.4s, v15.4s\n"
+ "and v19.16b, v30.16b, v22.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v18.16b, v24.16b, v22.16b\n"
+ "sqadd v2.4s, v2.4s, v17.4s\n"
+ "and v17.16b, v31.16b, v23.16b\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v16.16b, v27.16b, v22.16b\n"
+ "sqadd v8.4s, v8.4s, v21.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "sqadd v10.4s, v10.4s, v23.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v9.4s, v9.4s, v19.4s\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v19.4s\n"
- "sqadd v30.4s, v30.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
- "sqadd v6.4s, v6.4s, v28.4s\n"
- "srshl v24.4s, v24.4s, v22.4s\n"
- "sqxtn v9.4h, v9.4s\n"
- "srshl v0.4s, v0.4s, v22.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v23.4s\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v23.4s\n"
"sqxtn v2.4h, v2.4s\n"
- "srshl v6.4s, v6.4s, v22.4s\n"
- "sqxtn v10.4h, v10.4s\n"
- "sqxtn2 v9.8h, v24.4s\n"
- "sqxtn2 v7.8h, v0.4s\n"
- "sqxtn2 v2.8h, v30.4s\n"
- "sqxtn2 v10.8h, v6.4s\n"
- "sqadd v9.8h, v9.8h, v13.8h\n"
- "sqadd v7.8h, v7.8h, v13.8h\n"
- "sqadd v2.8h, v2.8h, v13.8h\n"
- "sqadd v10.8h, v10.8h, v13.8h\n"
- "smax v9.8h, v9.8h, v29.8h\n"
- "smax v7.8h, v7.8h, v29.8h\n"
- "smax v2.8h, v2.8h, v29.8h\n"
- "smax v10.8h, v10.8h, v29.8h\n"
- "smin v9.8h, v9.8h, v12.8h\n"
- "smin v7.8h, v7.8h, v12.8h\n"
- "smin v2.8h, v2.8h, v12.8h\n"
- "smin v10.8h, v10.8h, v12.8h\n"
- "uzp1 v9.16b, v9.16b, v9.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v27.4s, v27.4s, v22.4s\n"
+ "sqxtn v31.4h, v31.4s\n"
+ "sqxtn2 v2.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v24.4s\n"
+ "sqxtn2 v31.8h, v27.4s\n"
+ "sqadd v2.8h, v2.8h, v12.8h\n"
+ "sqadd v8.8h, v8.8h, v12.8h\n"
+ "sqadd v0.8h, v0.8h, v12.8h\n"
+ "sqadd v31.8h, v31.8h, v12.8h\n"
+ "smax v2.8h, v2.8h, v14.8h\n"
+ "smax v8.8h, v8.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v31.8h, v31.8h, v14.8h\n"
+ "smin v2.8h, v2.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v13.8h\n"
+ "smin v0.8h, v0.8h, v13.8h\n"
+ "smin v31.8h, v31.8h, v13.8h\n"
"uzp1 v2.16b, v2.16b, v2.16b\n"
- "uzp1 v10.16b, v10.16b, v10.16b\n"
- "tbz x7, #2, 61f\n"
- "st1 { v9.s }[0], [x11], #0x4\n"
- "st1 { v7.s }[0], [x10], #0x4\n"
- "st1 { v2.s }[0], [x9], #0x4\n"
- "st1 { v10.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 60f\n"
- "st1 { v9.h }[2], [x11], #0x2\n"
- "st1 { v7.h }[2], [x10], #0x2\n"
- "st1 { v2.h }[2], [x9], #0x2\n"
- "st1 { v10.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[6], [x11], #0x1\n"
- "st1 { v7.b }[6], [x10], #0x1\n"
- "st1 { v2.b }[6], [x9], #0x1\n"
- "st1 { v10.b }[6], [x28], #0x1\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v2.s }[0], [x10], #0x4\n"
+ "st1 { v8.s }[0], [x9], #0x4\n"
+ "st1 { v0.s }[0], [x28], #0x4\n"
+ "st1 { v31.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v2.h }[2], [x10], #0x2\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v0.h }[2], [x28], #0x2\n"
+ "st1 { v31.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[6], [x10], #0x1\n"
+ "st1 { v8.b }[6], [x9], #0x1\n"
+ "st1 { v0.b }[6], [x28], #0x1\n"
+ "st1 { v31.b }[6], [x27], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[4], [x11], #0x1\n"
- "st1 { v7.b }[4], [x10], #0x1\n"
- "st1 { v2.b }[4], [x9], #0x1\n"
- "st1 { v10.b }[4], [x28], #0x1\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[4], [x10], #0x1\n"
+ "st1 { v8.b }[4], [x9], #0x1\n"
+ "st1 { v0.b }[4], [x28], #0x1\n"
+ "st1 { v31.b }[4], [x27], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "st1 { v9.h }[0], [x11], #0x2\n"
- "st1 { v7.h }[0], [x10], #0x2\n"
- "st1 { v2.h }[0], [x9], #0x2\n"
- "st1 { v10.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[2], [x11], #0x1\n"
- "st1 { v7.b }[2], [x10], #0x1\n"
- "st1 { v2.b }[2], [x9], #0x1\n"
- "st1 { v10.b }[2], [x28], #0x1\n"
+ "tbz x8, #1, 62f\n"
+ "st1 { v2.h }[0], [x10], #0x2\n"
+ "st1 { v8.h }[0], [x9], #0x2\n"
+ "st1 { v0.h }[0], [x28], #0x2\n"
+ "st1 { v31.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[2], [x10], #0x1\n"
+ "st1 { v8.b }[2], [x9], #0x1\n"
+ "st1 { v0.b }[2], [x28], #0x1\n"
+ "st1 { v31.b }[2], [x27], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "st1 { v9.b }[0], [x11], #0x1\n"
- "st1 { v7.b }[0], [x10], #0x1\n"
- "st1 { v2.b }[0], [x9], #0x1\n"
- "st1 { v10.b }[0], [x28], #0x1\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v2.b }[0], [x10], #0x1\n"
+ "st1 { v8.b }[0], [x9], #0x1\n"
+ "st1 { v0.b }[0], [x28], #0x1\n"
+ "st1 { v31.b }[0], [x27], #0x1\n"
"63:" // Oddments: Bit 2: End
"64:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
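(Aside, not part of the patch: every exit path of the kernel above — the main loop, the tail, and the oddments path — ends with the same requantization sequence: SQRDMULH by the per-channel multiplier, a sign-dependent fixup (the AND / SSHR #31 / SQADD trio), SRSHL by the negative per-channel shift, SQXTN/SQXTN2 narrowing, SQADD of the c_offset, SMAX/SMIN clamping to [minval, maxval], and a final UZP1 to bytes before the stores. The parameter names below come from the arm_gemm::Requantize32 fields listed in the asm operand block; which vector register holds which constant, and the helper names themselves, are assumptions made for illustration. A minimal scalar sketch of that arithmetic, per output lane (the real kernel narrows to 16-bit lanes before adding c_offset):

#include <algorithm>
#include <cstdint>

// ~ SQRDMULH on one 32-bit lane: high half of the doubled product, rounded,
// saturating the single overflow case (INT32_MIN * INT32_MIN).
static inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == b && a == INT32_MIN) return INT32_MAX;
    const int64_t ab = (int64_t)a * (int64_t)b;
    const int64_t nudge = ab >= 0 ? (INT64_C(1) << 30) : 1 - (INT64_C(1) << 30);
    return (int32_t)((ab + nudge) / (INT64_C(1) << 31));
}

// Scalar model of the requantization tail for one accumulator lane.
static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_doubling_high_mul(acc, mul);   // SQRDMULH by requant_muls
    if (v < 0 && shift < 0) v -= 1;                     // AND / SSHR #31 / SQADD fixup
    const int s = -shift;                               // SRSHL by a negative amount
    if (s > 0) v = (v + (1 << (s - 1))) >> s;           //   = rounding right shift
    v += c_offset;                                      // SQADD of the c_offset vector
    v = std::max(v, minval);                            // SMAX against minval
    v = std::min(v, maxval);                            // SMIN against maxval
    return (uint8_t)v;                                  // SQXTN + UZP1 narrowing to bytes
}

The fixup line is what lets the round-half-up behaviour of SRSHL reproduce round-half-away-from-zero for negative accumulators, matching the reference requantization.)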
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index e9db8e1322..34758ed6a3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -55,7 +55,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,1294 +100,1294 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x8, x7, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v6.16b }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x17, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v15.16b }, [x21]\n"
- "ld1r { v13.8h }, [x20]\n"
+ "ld1r { v14.16b }, [x21]\n"
+ "ld1r { v25.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v17.8h }, [x21]\n"
- "ld1r { v24.8h }, [x20]\n"
- "mov x17, #0x0\n"
- "mov x16, #0x0\n"
- "add x15, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x11, x10, [x22, #0x0]\n"
- "ldp x9, x28, [x22, #0x10]\n"
- "cbz x8, 3f\n"
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "subs x8, x8, #0x1\n"
- "ssubl v11.8h, v11.8b, v15.8b\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "ssubl v22.8h, v22.8b, v15.8b\n"
- "ssubl v14.8h, v14.8b, v15.8b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "ssubl v28.8h, v28.8b, v15.8b\n"
- "ssubl v18.8h, v18.8b, v15.8b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "ssubl v9.8h, v9.8b, v15.8b\n"
- "ssubl v26.8h, v26.8b, v15.8b\n"
- "ldr d4, [x14, #0x40]\n"
+ "ld1r { v23.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x17, 3f\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "subs x17, x17, #0x1\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "ldr d7, [x6, #0x40]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr q19, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
"add x20, x20, #0x20\n"
+ "ssubl v7.8h, v7.8b, v14.8b\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d25, [x27, x17]\n"
- "ldr d27, [x26, x17]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d1, [x25, x17]\n"
- "ldr d2, [x24, x17]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "ldr d12, [x23, x17]\n"
- "ldr d16, [x22, x17]\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "ldr d23, [x21, x17]\n"
- "ldr d10, [x20, x17]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "ldr d26, [x27, x3]\n"
+ "ldr d31, [x26, x3]\n"
+ "ldr d20, [x25, x3]\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr d6, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d0, [x21, x3]\n"
+ "ldr d18, [x20, x3]\n"
+ "usubl v26.8h, v26.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr q30, [x13, #0x0]\n"
- "ldr q29, [x12, #0x0]\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x21, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "ldr x25, [x15, #0x60]\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "ldr d25, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal v20.4s, v27.4h, v28.4h\n"
- "smlal v19.4s, v25.4h, v18.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ldr d1, [x25, x17]\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "ldr d2, [x24, x17]\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v28.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v18.8h\n"
- "ldr d25, [x22, x17]\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v20.4s, v1.4h, v11.4h\n"
- "smlal v19.4s, v2.4h, v22.4h\n"
- "ldr x24, [x15, #0x50]\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ldr d16, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "ldr d12, [x20, x17]\n"
- "ldr x23, [x15, #0x48]\n"
- "smlal2 v0.4s, v1.8h, v11.8h\n"
- "smlal2 v31.4s, v2.8h, v22.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal v20.4s, v27.4h, v18.4h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v19.4s, v25.4h, v9.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ldr d23, [x25, x17]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "ldr d11, [x24, x17]\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v18.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v9.8h\n"
- "ldr d25, [x21, x17]\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v21.4s, v16.4h, v18.4h\n"
- "smlal v20.4s, v12.4h, v22.4h\n"
- "smlal v19.4s, v23.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "ldr d10, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
- "smlal v5.4s, v11.4h, v9.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal2 v8.4s, v16.8h, v18.8h\n"
- "ldr d18, [x22, x17]\n"
- "ldr d16, [x21, x17]\n"
- "smlal2 v0.4s, v12.8h, v22.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal2 v31.4s, v23.8h, v14.8h\n"
- "ldr q14, [x13, #0x10]\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v25.4h, v26.4h\n"
- "smlal v19.4s, v10.4h, v28.4h\n"
- "usubl v18.8h, v18.8b, v6.8b\n"
- "ldr x21, [x15, #0xc0]\n"
- "smlal2 v3.4s, v11.8h, v9.8h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q30, [x8, #0x0]\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "ldr x24, [x5, #0x58]\n"
+ "ldr x23, [x5, #0x78]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "ldr x22, [x5, #0x60]\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "ldr q26, [x7, #0x10]\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "ldr d31, [x24, x3]\n"
+ "ldr x12, [x5, #0x88]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "ldr x11, [x5, #0x40]\n"
+ "ldr x10, [x5, #0x70]\n"
+ "add x6, x6, #0x48\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x23, x3]\n"
+ "ldr x9, [x5, #0x98]\n"
+ "subs x17, x17, #0x1\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x28, [x5, #0x50]\n"
+ "ldr x27, [x5, #0x48]\n"
+ "add x7, x7, #0x20\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "ldr d20, [x22, x3]\n"
+ "ldr x26, [x5, #0x90]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "ldr x25, [x5, #0xa8]\n"
+ "ldr x24, [x5, #0xa0]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "ldr d31, [x21, x3]\n"
+ "ldr x23, [x5, #0xb0]\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "ldr x22, [x5, #0xb8]\n"
+ "smlal v3.4s, v28.4h, v27.4h\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "ldr x21, [x5, #0xc0]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v8.4s, v27.8h, v9.8h\n"
- "ldr d27, [x21, x17]\n"
- "smlal2 v0.4s, v25.8h, v26.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal2 v31.4s, v10.8h, v28.8h\n"
- "smlal v21.4s, v11.4h, v28.4h\n"
- "usubl v22.8h, v22.8b, v6.8b\n"
- "add x14, x14, #0x48\n"
- "smlal v20.4s, v18.4h, v7.4h\n"
- "smlal v19.4s, v16.4h, v7.4h\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "add x17, x17, #0x8\n"
- "smlal2 v3.4s, v1.8h, v26.8h\n"
- "smlal v5.4s, v12.4h, v7.4h\n"
- "sqrdmulh v5.4s, v5.4s, v30.4s\n"
- "subs x8, x8, #0x1\n"
- "smlal2 v8.4s, v11.8h, v28.8h\n"
- "smlal2 v0.4s, v18.8h, v7.8h\n"
- "and v28.16b, v5.16b, v29.16b\n"
- "add x13, x13, #0x20\n"
- "smlal2 v31.4s, v16.8h, v7.8h\n"
- "smlal v21.4s, v2.4h, v7.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v27.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "ldr d9, [x11, x3]\n"
+ "smlal v10.4s, v20.4h, v16.4h\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal2 v21.4s, v20.8h, v16.8h\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "ldr d16, [x10, x3]\n"
+ "smlal v3.4s, v31.4h, v11.4h\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "smlal2 v24.4s, v31.8h, v11.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "ldr d0, [x9, x3]\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "smlal v10.4s, v6.4h, v27.4h\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "smlal2 v21.4s, v6.8h, v27.8h\n"
+ "ldr d6, [x28, x3]\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal v8.4s, v9.4h, v27.4h\n"
+ "smlal2 v4.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "ldr d27, [x26, x3]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x25, x3]\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "ldr d18, [x24, x3]\n"
+ "smlal v10.4s, v16.4h, v11.4h\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x23, x3]\n"
+ "smlal v3.4s, v0.4h, v29.4h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v6.4h, v2.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d29, [x22, x3]\n"
+ "smlal2 v1.4s, v6.8h, v2.8h\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "usubl v11.8h, v11.8b, v13.8b\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal v10.4s, v27.4h, v22.4h\n"
+ "smlal v3.4s, v28.4h, v15.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal v19.4s, v20.4h, v22.4h\n"
+ "smlal2 v21.4s, v27.8h, v22.8h\n"
+ "ldr q27, [x8, #0x10]\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v15.8h\n"
+ "smlal2 v1.4s, v20.8h, v22.8h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "add x8, x8, #0x20\n"
+ "smlal v8.4s, v6.4h, v15.4h\n"
+ "smlal2 v4.4s, v6.8h, v15.8h\n"
+ "smlal v10.4s, v18.4h, v5.4h\n"
+ "smlal v3.4s, v11.4h, v5.4h\n"
+ "smlal v19.4s, v16.4h, v5.4h\n"
+ "smlal2 v21.4s, v18.8h, v5.8h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal v8.4s, v31.4h, v5.4h\n"
+ "smlal2 v4.4s, v31.8h, v5.8h\n"
+ "smlal v10.4s, v28.4h, v2.4h\n"
+ "smlal v3.4s, v29.4h, v22.4h\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal2 v24.4s, v29.8h, v22.8h\n"
+ "sqrdmulh v1.4s, v1.4s, v26.4s\n"
+ "smlal v8.4s, v0.4h, v7.4h\n"
+ "and v2.16b, v19.16b, v30.16b\n"
+ "smlal2 v4.4s, v0.8h, v7.8h\n"
+ "smlal v10.4s, v29.4h, v7.4h\n"
+ "smlal v3.4s, v9.4h, v7.4h\n"
+ "and v11.16b, v1.16b, v27.16b\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal2 v24.4s, v9.8h, v7.8h\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqrdmulh v4.4s, v4.4s, v26.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v2.4s\n"
+ "and v29.16b, v8.16b, v30.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "and v20.16b, v10.16b, v30.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "and v28.16b, v3.16b, v30.16b\n"
+ "sqadd v1.4s, v1.4s, v11.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v7.16b, v4.16b, v27.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v2.16b, v21.16b, v27.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "add x12, x12, #0x20\n"
- "smlal v20.4s, v10.4h, v9.4h\n"
- "smlal v19.4s, v22.4h, v26.4h\n"
- "sqadd v5.4s, v5.4s, v28.4s\n"
- "smlal2 v3.4s, v12.8h, v7.8h\n"
- "smlal2 v8.4s, v2.8h, v7.8h\n"
- "sqrdmulh v3.4s, v3.4s, v14.4s\n"
- "smlal2 v0.4s, v10.8h, v9.8h\n"
- "smlal2 v31.4s, v22.8h, v26.8h\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "smlal v21.4s, v23.4h, v4.4h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "sqrdmulh v21.4s, v21.4s, v30.4s\n"
- "smlal v19.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "smlal2 v0.4s, v22.8h, v4.8h\n"
- "smlal2 v31.4s, v27.8h, v4.8h\n"
- "sqrdmulh v19.4s, v19.4s, v30.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "and v12.16b, v21.16b, v29.16b\n"
- "sqrdmulh v8.4s, v8.4s, v14.4s\n"
- "and v23.16b, v20.16b, v29.16b\n"
- "sqrdmulh v0.4s, v0.4s, v14.4s\n"
- "and v9.16b, v19.16b, v29.16b\n"
- "sqrdmulh v31.4s, v31.4s, v14.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v12.4s, v12.4s, #0x1f\n"
- "and v18.16b, v8.16b, v25.16b\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v22.16b, v0.16b, v25.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v16.16b, v31.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v12.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v23.4s\n"
+ "and v22.16b, v24.16b, v27.16b\n"
+ "sqadd v8.4s, v8.4s, v29.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v3.4s, v3.4s, v28.4s\n"
"sshr v22.4s, v22.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v9.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v29.4s\n"
- "srshl v21.4s, v21.4s, v29.4s\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v29.4s\n"
- "sqadd v0.4s, v0.4s, v22.4s\n"
- "srshl v19.4s, v19.4s, v29.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v25.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v30.4s\n"
+ "sqadd v4.4s, v4.4s, v7.4s\n"
+ "srshl v10.4s, v10.4s, v30.4s\n"
+ "sqadd v21.4s, v21.4s, v2.4s\n"
+ "srshl v3.4s, v3.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v22.4s\n"
+ "srshl v1.4s, v1.4s, v27.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str d5, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
+ "srshl v4.4s, v4.4s, v27.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v27.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d20, [x9, x16]\n"
- "str d19, [x28, x16]\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str d19, [x16, x4]\n"
+ "str d8, [x15, x4]\n"
+ "str d10, [x14, x4]\n"
+ "str d3, [x13, x4]\n"
+ "add x4, x4, #0x8\n"
+ "ldr q19, [x20, #0x0]\n"
+ "ldr q1, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "add x16, x16, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d4, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ssubl v11.8h, v11.8b, v15.8b\n"
- "ssubl v22.8h, v22.8b, v15.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ssubl v14.8h, v14.8b, v15.8b\n"
- "ssubl v28.8h, v28.8b, v15.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ldr d25, [x27, x17]\n"
- "ssubl v18.8h, v18.8b, v15.8b\n"
- "ssubl v9.8h, v9.8b, v15.8b\n"
- "ldr d27, [x26, x17]\n"
- "ldr d1, [x25, x17]\n"
- "ssubl v26.8h, v26.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ldr d2, [x24, x17]\n"
- "ldr d12, [x23, x17]\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "ldr d16, [x22, x17]\n"
- "ldr d23, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "ldr d10, [x20, x17]\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d7, [x6, #0x40]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "ssubl v7.8h, v7.8b, v14.8b\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "ldr d26, [x27, x3]\n"
+ "ldr d31, [x26, x3]\n"
+ "ldr d20, [x25, x3]\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr d6, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d0, [x21, x3]\n"
+ "usubl v26.8h, v26.8b, v13.8b\n"
+ "ldr d18, [x20, x3]\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr q29, [x13, #0x0]\n"
- "ldr q30, [x12, #0x0]\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x21, [x15, #0x58]\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "ldr x25, [x15, #0x60]\n"
- "ldr x24, [x15, #0x80]\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "ldr d27, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "ldr x23, [x15, #0x68]\n"
- "ldr x22, [x15, #0x88]\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "ldr d25, [x20, x17]\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal v20.4s, v27.4h, v28.4h\n"
- "smlal v19.4s, v25.4h, v18.4h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "ldr d1, [x25, x17]\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "ldr d2, [x24, x17]\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v28.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v18.8h\n"
- "ldr d25, [x22, x17]\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "ldr x25, [x15, #0x98]\n"
- "smlal v20.4s, v1.4h, v11.4h\n"
- "smlal v19.4s, v2.4h, v22.4h\n"
- "ldr x24, [x15, #0x50]\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "ldr d16, [x21, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "ldr d12, [x20, x17]\n"
- "ldr x23, [x15, #0x48]\n"
- "smlal2 v0.4s, v1.8h, v11.8h\n"
- "smlal2 v31.4s, v2.8h, v22.8h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal v20.4s, v27.4h, v18.4h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x22, [x15, #0xa0]\n"
- "smlal v19.4s, v25.4h, v9.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "ldr d23, [x25, x17]\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "ldr d11, [x24, x17]\n"
- "usubl v11.8h, v11.8b, v6.8b\n"
- "smlal2 v0.4s, v27.8h, v18.8h\n"
- "ldr d27, [x23, x17]\n"
- "smlal2 v31.4s, v25.8h, v9.8h\n"
- "ldr d25, [x21, x17]\n"
- "ldr x21, [x15, #0xb0]\n"
- "smlal v21.4s, v16.4h, v18.4h\n"
- "smlal v20.4s, v12.4h, v22.4h\n"
- "smlal v19.4s, v23.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "ldr d10, [x20, x17]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "usubl v25.8h, v25.8b, v6.8b\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
- "smlal v5.4s, v11.4h, v9.4h\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal2 v8.4s, v16.8h, v18.8h\n"
- "ldr d16, [x22, x17]\n"
- "ldr d18, [x21, x17]\n"
- "smlal2 v0.4s, v12.8h, v22.8h\n"
- "ldr d22, [x20, x17]\n"
- "smlal2 v31.4s, v23.8h, v14.8h\n"
- "ldr q14, [x13, #0x10]\n"
- "smlal v21.4s, v27.4h, v9.4h\n"
- "smlal v20.4s, v25.4h, v26.4h\n"
- "smlal v19.4s, v10.4h, v28.4h\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal2 v3.4s, v11.8h, v9.8h\n"
- "usubl v18.8h, v18.8b, v6.8b\n"
- "smlal v5.4s, v1.4h, v26.4h\n"
- "tst x7, #0x7\n"
- "smlal2 v8.4s, v27.8h, v9.8h\n"
- "ldr d27, [x20, x17]\n"
- "smlal2 v0.4s, v25.8h, v26.8h\n"
- "ldr q25, [x12, #0x10]\n"
- "smlal2 v31.4s, v10.8h, v28.8h\n"
- "smlal v21.4s, v11.4h, v28.4h\n"
- "usubl v22.8h, v22.8b, v6.8b\n"
- "add x17, x17, #0x8\n"
- "smlal v20.4s, v16.4h, v7.4h\n"
- "smlal v19.4s, v18.4h, v7.4h\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "add x13, x13, #0x20\n"
- "smlal2 v3.4s, v1.8h, v26.8h\n"
- "smlal v5.4s, v12.4h, v7.4h\n"
- "sqrdmulh v5.4s, v5.4s, v29.4s\n"
- "add x12, x12, #0x20\n"
- "smlal2 v8.4s, v11.8h, v28.8h\n"
- "smlal2 v0.4s, v16.8h, v7.8h\n"
- "and v16.16b, v5.16b, v30.16b\n"
- "smlal2 v31.4s, v18.8h, v7.8h\n"
- "smlal v21.4s, v2.4h, v7.4h\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smlal v20.4s, v10.4h, v9.4h\n"
- "smlal v19.4s, v22.4h, v26.4h\n"
- "sqadd v5.4s, v5.4s, v16.4s\n"
- "smlal2 v3.4s, v12.8h, v7.8h\n"
- "smlal2 v8.4s, v2.8h, v7.8h\n"
- "sqrdmulh v3.4s, v3.4s, v14.4s\n"
- "smlal2 v0.4s, v10.8h, v9.8h\n"
- "smlal2 v31.4s, v22.8h, v26.8h\n"
- "and v16.16b, v3.16b, v25.16b\n"
- "smlal v21.4s, v23.4h, v4.4h\n"
- "smlal v20.4s, v22.4h, v4.4h\n"
- "sqrdmulh v21.4s, v21.4s, v29.4s\n"
- "smlal v19.4s, v27.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "sqrdmulh v20.4s, v20.4s, v29.4s\n"
- "smlal2 v0.4s, v22.8h, v4.8h\n"
- "smlal2 v31.4s, v27.8h, v4.8h\n"
- "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "ldr q30, [x7, #0x0]\n"
+ "ldr q17, [x8, #0x0]\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x24, [x5, #0x78]\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "ldr x23, [x5, #0x60]\n"
+ "ldr x10, [x5, #0x80]\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "ldr q26, [x7, #0x10]\n"
+ "ldr x22, [x5, #0x68]\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "ldr d31, [x20, x3]\n"
+ "ldr x21, [x5, #0x88]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "ldr x20, [x5, #0x40]\n"
+ "ldr x9, [x5, #0x70]\n"
+ "tst x2, #0x7\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x24, x3]\n"
+ "ldr x28, [x5, #0x98]\n"
+ "add x7, x7, #0x20\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x27, [x5, #0x50]\n"
+ "ldr x26, [x5, #0x48]\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "ldr d20, [x23, x3]\n"
+ "ldr x25, [x5, #0x90]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "ldr x24, [x5, #0xa8]\n"
+ "ldr x23, [x5, #0xa0]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "ldr d31, [x10, x3]\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal v3.4s, v28.4h, v27.4h\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "ldr x22, [x5, #0xb0]\n"
+ "smlal2 v24.4s, v28.8h, v27.8h\n"
+ "ldr d28, [x21, x3]\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "ldr x21, [x5, #0xb8]\n"
+ "smlal v10.4s, v20.4h, v16.4h\n"
+ "smlal2 v21.4s, v20.8h, v16.8h\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "ldr d16, [x9, x3]\n"
+ "smlal v3.4s, v31.4h, v11.4h\n"
+ "smlal2 v24.4s, v31.8h, v11.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "ldr d0, [x28, x3]\n"
+ "smlal v10.4s, v6.4h, v27.4h\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "smlal2 v21.4s, v6.8h, v27.8h\n"
+ "ldr d6, [x27, x3]\n"
+ "smlal v8.4s, v9.4h, v27.4h\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal2 v4.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x26, x3]\n"
+ "ldr d27, [x25, x3]\n"
+ "smlal v3.4s, v28.4h, v2.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr d28, [x24, x3]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v10.4s, v16.4h, v11.4h\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "ldr d18, [x23, x3]\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal2 v21.4s, v16.8h, v11.8h\n"
+ "ldr d11, [x22, x3]\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "smlal v3.4s, v0.4h, v29.4h\n"
+ "smlal v19.4s, v6.4h, v2.4h\n"
+ "smlal2 v24.4s, v0.8h, v29.8h\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v22.4h\n"
+ "smlal2 v1.4s, v6.8h, v2.8h\n"
+ "usubl v11.8h, v11.8b, v13.8b\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v21.4s, v27.8h, v22.8h\n"
+ "ldr q27, [x8, #0x10]\n"
+ "smlal v3.4s, v28.4h, v15.4h\n"
+ "smlal v19.4s, v20.4h, v22.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v24.4s, v28.8h, v15.8h\n"
+ "smlal v8.4s, v6.4h, v15.4h\n"
+ "add x8, x8, #0x20\n"
+ "smlal v10.4s, v18.4h, v5.4h\n"
+ "smlal2 v1.4s, v20.8h, v22.8h\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "smlal2 v4.4s, v6.8h, v15.8h\n"
+ "smlal2 v21.4s, v18.8h, v5.8h\n"
+ "smlal v3.4s, v11.4h, v5.4h\n"
+ "smlal v19.4s, v16.4h, v5.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v8.4s, v31.4h, v5.4h\n"
+ "smlal v10.4s, v28.4h, v2.4h\n"
+ "smlal2 v1.4s, v16.8h, v5.8h\n"
+ "smlal2 v4.4s, v31.8h, v5.8h\n"
+ "smlal2 v21.4s, v28.8h, v2.8h\n"
+ "smlal v3.4s, v29.4h, v22.4h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "smlal2 v24.4s, v29.8h, v22.8h\n"
+ "smlal v8.4s, v0.4h, v7.4h\n"
+ "smlal v10.4s, v29.4h, v7.4h\n"
+ "sqrdmulh v1.4s, v1.4s, v26.4s\n"
+ "and v5.16b, v19.16b, v17.16b\n"
+ "smlal2 v4.4s, v0.8h, v7.8h\n"
+ "smlal2 v21.4s, v29.8h, v7.8h\n"
+ "smlal v3.4s, v9.4h, v7.4h\n"
+ "smlal2 v24.4s, v9.8h, v7.8h\n"
+ "and v16.16b, v1.16b, v27.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqrdmulh v8.4s, v8.4s, v30.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v30.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v23.16b, v21.16b, v30.16b\n"
- "sqrdmulh v8.4s, v8.4s, v14.4s\n"
- "and v27.16b, v20.16b, v30.16b\n"
- "sqrdmulh v0.4s, v0.4s, v14.4s\n"
- "and v22.16b, v19.16b, v30.16b\n"
- "sqrdmulh v31.4s, v31.4s, v14.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v23.4s, v23.4s, #0x1f\n"
- "and v14.16b, v8.16b, v25.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "and v18.16b, v0.16b, v25.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v16.16b, v31.16b, v25.16b\n"
- "sqadd v21.4s, v21.4s, v23.4s\n"
- "sshr v14.4s, v14.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v27.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v26.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v26.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v30.4s\n"
+ "sqadd v19.4s, v19.4s, v5.4s\n"
+ "and v30.16b, v8.16b, v17.16b\n"
+ "and v20.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v26.4s\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v2.16b, v3.16b, v17.16b\n"
+ "and v11.16b, v4.16b, v27.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v27.16b\n"
+ "and v16.16b, v24.16b, v27.16b\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v30.4s\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v30.4s\n"
- "srshl v21.4s, v21.4s, v30.4s\n"
- "sqadd v8.4s, v8.4s, v14.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "sqadd v0.4s, v0.4s, v18.4s\n"
- "srshl v19.4s, v19.4s, v30.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v25.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v25.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v25.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqadd v3.4s, v3.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v17.4s\n"
+ "srshl v8.4s, v8.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v11.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "srshl v3.4s, v3.4s, v17.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v27.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "str d5, [x11, x16]\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d21, [x10, x16]\n"
+ "srshl v4.4s, v4.4s, v27.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v27.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d20, [x9, x16]\n"
- "str d19, [x28, x16]\n"
- "add x16, x16, #0x8\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "str d19, [x16, x4]\n"
+ "str d8, [x15, x4]\n"
+ "str d10, [x14, x4]\n"
+ "str d3, [x13, x4]\n"
+ "add x4, x4, #0x8\n"
"beq 88f\n"
- "add x14, x14, #0x48\n"
+ "add x6, x6, #0x48\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x7, #2, 5f\n"
- "ld1 { v5.4s }, [x20], #0x10\n"
- "tbz x7, #1, 4f\n"
- "ld1 { v3.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v3.s }[2], [x20]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v19.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v1.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v1.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v3.s }[0], [x20]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v1.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x7, #1, 6f\n"
- "ld1 { v5.d }[0], [x20], #0x8\n"
- "tbz x7, #0, 7f\n"
- "ld1 { v5.s }[2], [x20]\n"
+ "tbz x2, #1, 6f\n"
+ "ld1 { v19.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v19.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 7f\n"
- "ld1 { v5.s }[0], [x20]\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v19.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d11, [x14, #0x0]\n"
- "ldr d22, [x14, #0x8]\n"
- "mov v21.16b, v5.16b\n"
- "mov v8.16b, v3.16b\n"
- "ldr d14, [x14, #0x10]\n"
- "ldr d28, [x14, #0x18]\n"
- "mov v20.16b, v5.16b\n"
- "mov v0.16b, v3.16b\n"
- "ldr d18, [x14, #0x20]\n"
- "ldr d9, [x14, #0x28]\n"
- "mov v19.16b, v5.16b\n"
- "mov v31.16b, v3.16b\n"
- "ldr d26, [x14, #0x30]\n"
- "ldr d7, [x14, #0x38]\n"
- "ssubl v11.8h, v11.8b, v15.8b\n"
- "ssubl v22.8h, v22.8b, v15.8b\n"
- "ldr d4, [x14, #0x40]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ssubl v14.8h, v14.8b, v15.8b\n"
- "ssubl v28.8h, v28.8b, v15.8b\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ssubl v18.8h, v18.8b, v15.8b\n"
- "ssubl v9.8h, v9.8b, v15.8b\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ssubl v26.8h, v26.8b, v15.8b\n"
- "ssubl v7.8h, v7.8b, v15.8b\n"
- "ssubl v4.8h, v4.8b, v15.8b\n"
- "add x27, x27, x17\n"
- "add x26, x26, x17\n"
- "add x25, x25, x17\n"
- "add x24, x24, x17\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 9f\n"
- "ld1 { v25.s }[0], [x27], #0x4\n"
- "ld1 { v27.s }[0], [x26], #0x4\n"
- "ld1 { v1.s }[0], [x25], #0x4\n"
- "ld1 { v2.s }[0], [x24], #0x4\n"
- "ld1 { v12.s }[0], [x23], #0x4\n"
- "ld1 { v16.s }[0], [x22], #0x4\n"
- "ld1 { v23.s }[0], [x21], #0x4\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 8f\n"
- "ld1 { v25.h }[2], [x27], #0x2\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "ld1 { v1.h }[2], [x25], #0x2\n"
- "ld1 { v2.h }[2], [x24], #0x2\n"
- "ld1 { v12.h }[2], [x23], #0x2\n"
- "ld1 { v16.h }[2], [x22], #0x2\n"
- "ld1 { v23.h }[2], [x21], #0x2\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[6], [x27]\n"
- "ld1 { v27.b }[6], [x26]\n"
- "ld1 { v1.b }[6], [x25]\n"
- "ld1 { v2.b }[6], [x24]\n"
- "ld1 { v12.b }[6], [x23]\n"
- "ld1 { v16.b }[6], [x22]\n"
- "ld1 { v23.b }[6], [x21]\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ldr d16, [x6, #0x0]\n"
+ "ldr d11, [x6, #0x8]\n"
+ "mov v8.16b, v19.16b\n"
+ "mov v4.16b, v1.16b\n"
+ "ldr d29, [x6, #0x10]\n"
+ "ldr d15, [x6, #0x18]\n"
+ "mov v10.16b, v19.16b\n"
+ "mov v21.16b, v1.16b\n"
+ "ldr d27, [x6, #0x20]\n"
+ "ldr d2, [x6, #0x28]\n"
+ "mov v3.16b, v19.16b\n"
+ "mov v24.16b, v1.16b\n"
+ "ldr d22, [x6, #0x30]\n"
+ "ldr d5, [x6, #0x38]\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "ldr d7, [x6, #0x40]\n"
+ "ldp x27, x26, [x5, #0x0]\n"
+ "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldp x25, x24, [x5, #0x10]\n"
+ "ssubl v7.8h, v7.8b, v14.8b\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "ldp x23, x22, [x5, #0x20]\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "ldp x21, x20, [x5, #0x30]\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v20.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v0.s }[0], [x21], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v20.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v0.h }[2], [x21], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v20.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v6.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v0.b }[6], [x21]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[4], [x27]\n"
- "ld1 { v27.b }[4], [x26]\n"
- "ld1 { v1.b }[4], [x25]\n"
- "ld1 { v2.b }[4], [x24]\n"
- "ld1 { v12.b }[4], [x23]\n"
- "ld1 { v16.b }[4], [x22]\n"
- "ld1 { v23.b }[4], [x21]\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v20.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v6.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v0.b }[4], [x21]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x7, #1, 10f\n"
- "ld1 { v25.h }[0], [x27], #0x2\n"
- "ld1 { v27.h }[0], [x26], #0x2\n"
- "ld1 { v1.h }[0], [x25], #0x2\n"
- "ld1 { v2.h }[0], [x24], #0x2\n"
- "ld1 { v12.h }[0], [x23], #0x2\n"
- "ld1 { v16.h }[0], [x22], #0x2\n"
- "ld1 { v23.h }[0], [x21], #0x2\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[2], [x27]\n"
- "ld1 { v27.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x25]\n"
- "ld1 { v2.b }[2], [x24]\n"
- "ld1 { v12.b }[2], [x23]\n"
- "ld1 { v16.b }[2], [x22]\n"
- "ld1 { v23.b }[2], [x21]\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "tbz x2, #1, 10f\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v20.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v0.h }[0], [x21], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v20.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v6.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v0.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 11f\n"
- "ld1 { v25.b }[0], [x27]\n"
- "ld1 { v27.b }[0], [x26]\n"
- "ld1 { v1.b }[0], [x25]\n"
- "ld1 { v2.b }[0], [x24]\n"
- "ld1 { v12.b }[0], [x23]\n"
- "ld1 { v16.b }[0], [x22]\n"
- "ld1 { v23.b }[0], [x21]\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v20.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v6.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v0.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v25.8h, v25.8b, v6.8b\n"
- "smlal v5.4s, v25.4h, v4.4h\n"
- "smlal2 v3.4s, v25.8h, v4.8h\n"
- "ldr x20, [x15, #0x40]\n"
- "usubl v27.8h, v27.8b, v6.8b\n"
- "smlal v5.4s, v27.4h, v11.4h\n"
- "smlal2 v3.4s, v27.8h, v11.8h\n"
- "usubl v1.8h, v1.8b, v6.8b\n"
- "smlal v21.4s, v25.4h, v26.4h\n"
- "smlal2 v8.4s, v25.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "smlal v5.4s, v1.4h, v22.4h\n"
- "smlal2 v3.4s, v1.8h, v22.8h\n"
- "usubl v2.8h, v2.8b, v6.8b\n"
- "usubl v16.8h, v16.8b, v6.8b\n"
- "smlal v21.4s, v2.4h, v22.4h\n"
- "smlal2 v8.4s, v2.8h, v22.8h\n"
- "smlal v5.4s, v16.4h, v28.4h\n"
- "smlal2 v3.4s, v16.8h, v28.8h\n"
- "usubl v12.8h, v12.8b, v6.8b\n"
- "usubl v23.8h, v23.8b, v6.8b\n"
- "smlal v21.4s, v12.4h, v14.4h\n"
- "smlal2 v8.4s, v12.8h, v14.8h\n"
- "smlal v5.4s, v23.4h, v18.4h\n"
- "smlal2 v3.4s, v23.8h, v18.8h\n"
- "usubl v10.8h, v10.8b, v6.8b\n"
- "smlal v20.4s, v25.4h, v14.4h\n"
- "smlal2 v0.4s, v25.8h, v14.8h\n"
- "smlal v19.4s, v25.4h, v11.4h\n"
- "smlal2 v31.4s, v25.8h, v11.8h\n"
- "smlal v5.4s, v10.4h, v14.4h\n"
- "smlal2 v3.4s, v10.8h, v14.8h\n"
- "smlal v21.4s, v10.4h, v11.4h\n"
- "smlal2 v8.4s, v10.8h, v11.8h\n"
- "tbz x7, #2, 13f\n"
- "ld1 { v15.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 12f\n"
- "ld1 { v15.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[6], [x20]\n"
+ "usubl v26.8h, v26.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x40]\n"
+ "usubl v20.8h, v20.8b, v13.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "smlal v19.4s, v26.4h, v7.4h\n"
+ "smlal2 v1.4s, v26.8h, v7.8h\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v4.4s, v26.8h, v22.8h\n"
+ "usubl v18.8h, v18.8b, v13.8b\n"
+ "smlal v10.4s, v26.4h, v29.4h\n"
+ "smlal2 v21.4s, v26.8h, v29.8h\n"
+ "smlal v3.4s, v26.4h, v16.4h\n"
+ "smlal v19.4s, v31.4h, v16.4h\n"
+ "smlal2 v24.4s, v26.8h, v16.8h\n"
+ "smlal2 v1.4s, v31.8h, v16.8h\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v4.4s, v28.8h, v11.8h\n"
+ "smlal v19.4s, v20.4h, v11.4h\n"
+ "smlal2 v1.4s, v20.8h, v11.8h\n"
+ "smlal v8.4s, v6.4h, v29.4h\n"
+ "smlal v19.4s, v9.4h, v15.4h\n"
+ "smlal2 v4.4s, v6.8h, v29.8h\n"
+ "smlal2 v1.4s, v9.8h, v15.8h\n"
+ "smlal v8.4s, v18.4h, v16.4h\n"
+ "smlal v19.4s, v0.4h, v27.4h\n"
+ "smlal2 v4.4s, v18.8h, v16.8h\n"
+ "smlal2 v1.4s, v0.8h, v27.8h\n"
+ "smlal v19.4s, v18.4h, v29.4h\n"
+ "smlal2 v1.4s, v18.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[4], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x7, #1, 14f\n"
- "ld1 { v15.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[2], [x20]\n"
+ "tbz x2, #1, 14f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 15f\n"
- "ld1 { v15.b }[0], [x20]\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v15.8h, v15.8b, v6.8b\n"
- "ldr x20, [x15, #0x48]\n"
- "smlal v21.4s, v15.4h, v18.4h\n"
- "smlal2 v8.4s, v15.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 17f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 16f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x48]\n"
+ "smlal v8.4s, v30.4h, v27.4h\n"
+ "smlal2 v4.4s, v30.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x7, #1, 18f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 18f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 19f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v9.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x50]\n"
- "smlal v21.4s, v16.4h, v9.4h\n"
- "smlal2 v8.4s, v16.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 21f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 20f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x50]\n"
+ "smlal v8.4s, v9.4h, v2.4h\n"
+ "smlal2 v4.4s, v9.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x7, #1, 22f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 22f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 23f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x58]\n"
- "smlal v5.4s, v16.4h, v9.4h\n"
- "smlal2 v3.4s, v16.8h, v9.8h\n"
- "smlal v21.4s, v16.4h, v28.4h\n"
- "smlal2 v8.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 25f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 24f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v19.4s, v17.4h, v2.4h\n"
+ "smlal2 v1.4s, v17.8h, v2.8h\n"
+ "smlal v8.4s, v17.4h, v15.4h\n"
+ "smlal2 v4.4s, v17.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x7, #1, 26f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 26f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 27f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v31.b }[0], [x20]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x60]\n"
- "smlal v20.4s, v16.4h, v28.4h\n"
- "smlal2 v0.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 29f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 28f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v10.4s, v31.4h, v15.4h\n"
+ "smlal2 v21.4s, v31.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x7, #1, 30f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 30f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 31f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x68]\n"
- "smlal v5.4s, v16.4h, v26.4h\n"
- "smlal2 v3.4s, v16.8h, v26.8h\n"
- "smlal v20.4s, v16.4h, v11.4h\n"
- "smlal2 v0.4s, v16.8h, v11.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 33f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 32f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v19.4s, v17.4h, v22.4h\n"
+ "smlal2 v1.4s, v17.8h, v22.8h\n"
+ "smlal v10.4s, v17.4h, v16.4h\n"
+ "smlal2 v21.4s, v17.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x7, #1, 34f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 34f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 35f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v30.b }[0], [x20]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x70]\n"
- "smlal v20.4s, v16.4h, v18.4h\n"
- "smlal2 v0.4s, v16.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 37f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 36f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v10.4s, v30.4h, v27.4h\n"
+ "smlal2 v21.4s, v30.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x7, #1, 38f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 38f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 39f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x78]\n"
- "smlal v5.4s, v16.4h, v7.4h\n"
- "smlal2 v3.4s, v16.8h, v7.8h\n"
- "smlal v20.4s, v16.4h, v22.4h\n"
- "smlal2 v0.4s, v16.8h, v22.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 41f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v19.4s, v17.4h, v5.4h\n"
+ "smlal2 v1.4s, v17.8h, v5.8h\n"
+ "smlal v10.4s, v17.4h, v11.4h\n"
+ "smlal2 v21.4s, v17.8h, v11.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 41f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 40f\n"
+ "tbz x2, #1, 40f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x7, #1, 42f\n"
+ "tbz x2, #1, 42f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 43f\n"
+ "tbz x2, #0, 43f\n"
"ld1 { v16.b }[0], [x20]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x80]\n"
- "smlal v19.4s, v16.4h, v18.4h\n"
- "smlal2 v31.4s, v16.8h, v18.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 45f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 44f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v3.4s, v16.4h, v27.4h\n"
+ "smlal2 v24.4s, v16.8h, v27.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x7, #1, 46f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 46f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 47f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x88]\n"
- "smlal v21.4s, v16.4h, v7.4h\n"
- "smlal2 v8.4s, v16.8h, v7.8h\n"
- "smlal v19.4s, v16.4h, v22.4h\n"
- "smlal2 v31.4s, v16.8h, v22.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 49f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v8.4s, v17.4h, v5.4h\n"
+ "smlal2 v4.4s, v17.8h, v5.8h\n"
+ "smlal v3.4s, v17.4h, v11.4h\n"
+ "smlal2 v24.4s, v17.8h, v11.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 49f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 48f\n"
+ "tbz x2, #1, 48f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x7, #1, 50f\n"
+ "tbz x2, #1, 50f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 51f\n"
+ "tbz x2, #0, 51f\n"
"ld1 { v16.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x90]\n"
- "smlal v19.4s, v16.4h, v9.4h\n"
- "smlal2 v31.4s, v16.8h, v9.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 53f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 52f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v3.4s, v16.4h, v2.4h\n"
+ "smlal2 v24.4s, v16.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x7, #1, 54f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 54f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 55f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0x98]\n"
- "smlal v20.4s, v16.4h, v26.4h\n"
- "smlal2 v0.4s, v16.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 57f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v10.4s, v17.4h, v22.4h\n"
+ "smlal2 v21.4s, v17.8h, v22.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 57f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 56f\n"
+ "tbz x2, #1, 56f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x7, #1, 58f\n"
+ "tbz x2, #1, 58f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 59f\n"
+ "tbz x2, #0, 59f\n"
"ld1 { v16.b }[0], [x20]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xa0]\n"
- "smlal v21.4s, v16.4h, v4.4h\n"
- "smlal2 v8.4s, v16.8h, v4.8h\n"
- "smlal v19.4s, v16.4h, v14.4h\n"
- "smlal2 v31.4s, v16.8h, v14.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 61f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 60f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v8.4s, v16.4h, v7.4h\n"
+ "smlal2 v4.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v29.4h\n"
+ "smlal2 v24.4s, v16.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x7, #1, 62f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 62f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 63f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xa8]\n"
- "smlal v20.4s, v16.4h, v7.4h\n"
- "smlal2 v0.4s, v16.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 65f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v10.4s, v17.4h, v5.4h\n"
+ "smlal2 v21.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 65f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 64f\n"
+ "tbz x2, #1, 64f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x7, #1, 66f\n"
+ "tbz x2, #1, 66f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 67f\n"
+ "tbz x2, #0, 67f\n"
"ld1 { v16.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xb0]\n"
- "smlal v20.4s, v16.4h, v9.4h\n"
- "smlal2 v0.4s, v16.8h, v9.8h\n"
- "smlal v19.4s, v16.4h, v28.4h\n"
- "smlal2 v31.4s, v16.8h, v28.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 69f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 68f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v10.4s, v16.4h, v2.4h\n"
+ "smlal2 v21.4s, v16.8h, v2.8h\n"
+ "smlal v3.4s, v16.4h, v15.4h\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x7, #1, 70f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 70f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 71f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xb8]\n"
- "smlal v19.4s, v16.4h, v7.4h\n"
- "smlal2 v31.4s, v16.8h, v7.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 73f\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v3.4s, v17.4h, v5.4h\n"
+ "smlal2 v24.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 73f\n"
"ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 72f\n"
+ "tbz x2, #1, 72f\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x7, #1, 74f\n"
+ "tbz x2, #1, 74f\n"
"ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 75f\n"
+ "tbz x2, #0, 75f\n"
"ld1 { v16.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "ldr x20, [x15, #0xc0]\n"
- "smlal v20.4s, v16.4h, v4.4h\n"
- "smlal2 v0.4s, v16.8h, v4.8h\n"
- "smlal v19.4s, v16.4h, v26.4h\n"
- "smlal2 v31.4s, v16.8h, v26.8h\n"
- "add x20, x20, x17\n"
- "tbz x7, #2, 77f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
- "tbz x7, #1, 76f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v10.4s, v16.4h, v7.4h\n"
+ "smlal2 v21.4s, v16.8h, v7.8h\n"
+ "smlal v3.4s, v16.4h, v22.4h\n"
+ "smlal2 v24.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x7, #1, 78f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "tbz x2, #1, 78f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 79f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v17.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v16.8h, v16.8b, v6.8b\n"
- "smlal v19.4s, v16.4h, v4.4h\n"
- "smlal2 v31.4s, v16.8h, v4.8h\n"
- "tbz x7, #2, 81f\n"
- "ld1 { v14.4s }, [x13], #0x10\n"
- "ld1 { v25.4s }, [x12], #0x10\n"
- "tbz x7, #1, 80f\n"
- "ld1 { v18.d }[0], [x13], #0x8\n"
- "ld1 { v12.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x12]\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "smlal v3.4s, v17.4h, v7.4h\n"
+ "smlal2 v24.4s, v17.8h, v7.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v16.4s }, [x7], #0x10\n"
+ "ld1 { v22.4s }, [x8], #0x10\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v0.d }[0], [x7], #0x8\n"
+ "ld1 { v31.d }[0], [x8], #0x8\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v0.s }[2], [x7]\n"
+ "ld1 { v31.s }[2], [x8]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v18.s }[0], [x13]\n"
- "ld1 { v12.s }[0], [x12]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v0.s }[0], [x7]\n"
+ "ld1 { v31.s }[0], [x8]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x7, #1, 82f\n"
- "ld1 { v14.d }[0], [x13], #0x8\n"
- "ld1 { v25.d }[0], [x12], #0x8\n"
- "tbz x7, #0, 83f\n"
- "ld1 { v14.s }[2], [x13]\n"
- "ld1 { v25.s }[2], [x12]\n"
+ "tbz x2, #1, 82f\n"
+ "ld1 { v16.d }[0], [x7], #0x8\n"
+ "ld1 { v22.d }[0], [x8], #0x8\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v16.s }[2], [x7]\n"
+ "ld1 { v22.s }[2], [x8]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 83f\n"
- "ld1 { v14.s }[0], [x13]\n"
- "ld1 { v25.s }[0], [x12]\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v16.s }[0], [x7]\n"
+ "ld1 { v22.s }[0], [x8]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v5.4s, v5.4s, v14.4s\n"
- "and v28.16b, v5.16b, v25.16b\n"
- "add x11, x11, x16\n"
- "add x10, x10, x16\n"
- "sqrdmulh v3.4s, v3.4s, v18.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "add x9, x9, x16\n"
- "add x28, x28, x16\n"
- "and v16.16b, v3.16b, v12.16b\n"
- "sqrdmulh v21.4s, v21.4s, v14.4s\n"
- "sqrdmulh v20.4s, v20.4s, v14.4s\n"
- "sqrdmulh v19.4s, v19.4s, v14.4s\n"
- "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v16.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v0.4s\n"
+ "add x16, x16, x4\n"
+ "add x15, x15, x4\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v16.4s\n"
+ "add x14, x14, x4\n"
+ "add x13, x13, x4\n"
+ "sqrdmulh v3.4s, v3.4s, v16.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v0.4s\n"
+ "and v17.16b, v19.16b, v22.16b\n"
+ "and v16.16b, v1.16b, v31.16b\n"
+ "and v15.16b, v8.16b, v22.16b\n"
+ "and v20.16b, v10.16b, v22.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v14.16b, v21.16b, v25.16b\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "and v6.16b, v20.16b, v25.16b\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "and v4.16b, v19.16b, v25.16b\n"
- "sqrdmulh v31.4s, v31.4s, v18.4s\n"
- "sqadd v3.4s, v3.4s, v16.4s\n"
- "sshr v14.4s, v14.4s, #0x1f\n"
- "and v18.16b, v8.16b, v12.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "and v7.16b, v0.16b, v12.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v16.16b, v31.16b, v12.16b\n"
- "sqadd v21.4s, v21.4s, v14.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sqadd v20.4s, v20.4s, v6.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v26.16b, v4.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v0.16b, v21.16b, v31.16b\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "and v17.16b, v3.16b, v22.16b\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
+ "and v16.16b, v24.16b, v31.16b\n"
+ "sqadd v8.4s, v8.4s, v15.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v20.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "srshl v5.4s, v5.4s, v25.4s\n"
- "srshl v21.4s, v21.4s, v25.4s\n"
- "sqadd v8.4s, v8.4s, v18.4s\n"
- "srshl v20.4s, v20.4s, v25.4s\n"
- "sqadd v0.4s, v0.4s, v7.4s\n"
- "srshl v19.4s, v19.4s, v25.4s\n"
- "sqadd v31.4s, v31.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v12.4s\n"
- "sqxtn v5.4h, v5.4s\n"
- "srshl v8.4s, v8.4s, v12.4s\n"
- "sqxtn v21.4h, v21.4s\n"
- "srshl v0.4s, v0.4s, v12.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v31.4s, v31.4s, v12.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v8.4s, v8.4s, v22.4s\n"
+ "sqadd v3.4s, v3.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v22.4s\n"
+ "sqadd v21.4s, v21.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v31.4s\n"
+ "srshl v3.4s, v3.4s, v22.4s\n"
"sqxtn v19.4h, v19.4s\n"
- "sqxtn2 v5.8h, v3.4s\n"
- "sqxtn2 v21.8h, v8.4s\n"
- "sqxtn2 v20.8h, v0.4s\n"
- "sqxtn2 v19.8h, v31.4s\n"
- "sqadd v5.8h, v5.8h, v13.8h\n"
- "sqadd v21.8h, v21.8h, v13.8h\n"
- "sqadd v20.8h, v20.8h, v13.8h\n"
- "sqadd v19.8h, v19.8h, v13.8h\n"
- "smax v5.8h, v5.8h, v17.8h\n"
- "smax v21.8h, v21.8h, v17.8h\n"
- "smax v20.8h, v20.8h, v17.8h\n"
- "smax v19.8h, v19.8h, v17.8h\n"
- "smin v5.8h, v5.8h, v24.8h\n"
- "smin v21.8h, v21.8h, v24.8h\n"
- "smin v20.8h, v20.8h, v24.8h\n"
- "smin v19.8h, v19.8h, v24.8h\n"
- "uzp1 v5.16b, v5.16b, v5.16b\n"
- "uzp1 v21.16b, v21.16b, v21.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "srshl v4.4s, v4.4s, v31.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "sqxtn2 v19.8h, v1.4s\n"
+ "sqxtn2 v8.8h, v4.4s\n"
+ "sqxtn2 v10.8h, v21.4s\n"
+ "sqxtn2 v3.8h, v24.4s\n"
+ "sqadd v19.8h, v19.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "sqadd v10.8h, v10.8h, v25.8h\n"
+ "sqadd v3.8h, v3.8h, v25.8h\n"
+ "smax v19.8h, v19.8h, v23.8h\n"
+ "smax v8.8h, v8.8h, v23.8h\n"
+ "smax v10.8h, v10.8h, v23.8h\n"
+ "smax v3.8h, v3.8h, v23.8h\n"
+ "smin v19.8h, v19.8h, v12.8h\n"
+ "smin v8.8h, v8.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
- "tbz x7, #2, 85f\n"
- "st1 { v5.s }[0], [x11], #0x4\n"
- "st1 { v21.s }[0], [x10], #0x4\n"
- "st1 { v20.s }[0], [x9], #0x4\n"
- "st1 { v19.s }[0], [x28], #0x4\n"
- "tbz x7, #1, 84f\n"
- "st1 { v5.h }[2], [x11], #0x2\n"
- "st1 { v21.h }[2], [x10], #0x2\n"
- "st1 { v20.h }[2], [x9], #0x2\n"
- "st1 { v19.h }[2], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[6], [x11], #0x1\n"
- "st1 { v21.b }[6], [x10], #0x1\n"
- "st1 { v20.b }[6], [x9], #0x1\n"
- "st1 { v19.b }[6], [x28], #0x1\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "tbz x2, #2, 85f\n"
+ "st1 { v19.s }[0], [x16], #0x4\n"
+ "st1 { v8.s }[0], [x15], #0x4\n"
+ "st1 { v10.s }[0], [x14], #0x4\n"
+ "st1 { v3.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "st1 { v19.h }[2], [x16], #0x2\n"
+ "st1 { v8.h }[2], [x15], #0x2\n"
+ "st1 { v10.h }[2], [x14], #0x2\n"
+ "st1 { v3.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[6], [x16], #0x1\n"
+ "st1 { v8.b }[6], [x15], #0x1\n"
+ "st1 { v10.b }[6], [x14], #0x1\n"
+ "st1 { v3.b }[6], [x13], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[4], [x11], #0x1\n"
- "st1 { v21.b }[4], [x10], #0x1\n"
- "st1 { v20.b }[4], [x9], #0x1\n"
- "st1 { v19.b }[4], [x28], #0x1\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[4], [x16], #0x1\n"
+ "st1 { v8.b }[4], [x15], #0x1\n"
+ "st1 { v10.b }[4], [x14], #0x1\n"
+ "st1 { v3.b }[4], [x13], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x7, #1, 86f\n"
- "st1 { v5.h }[0], [x11], #0x2\n"
- "st1 { v21.h }[0], [x10], #0x2\n"
- "st1 { v20.h }[0], [x9], #0x2\n"
- "st1 { v19.h }[0], [x28], #0x2\n"
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[2], [x11], #0x1\n"
- "st1 { v21.b }[2], [x10], #0x1\n"
- "st1 { v20.b }[2], [x9], #0x1\n"
- "st1 { v19.b }[2], [x28], #0x1\n"
+ "tbz x2, #1, 86f\n"
+ "st1 { v19.h }[0], [x16], #0x2\n"
+ "st1 { v8.h }[0], [x15], #0x2\n"
+ "st1 { v10.h }[0], [x14], #0x2\n"
+ "st1 { v3.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[2], [x16], #0x1\n"
+ "st1 { v8.b }[2], [x15], #0x1\n"
+ "st1 { v10.b }[2], [x14], #0x1\n"
+ "st1 { v3.b }[2], [x13], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x7, #0, 87f\n"
- "st1 { v5.b }[0], [x11], #0x1\n"
- "st1 { v21.b }[0], [x10], #0x1\n"
- "st1 { v20.b }[0], [x9], #0x1\n"
- "st1 { v19.b }[0], [x28], #0x1\n"
+ "tbz x2, #0, 87f\n"
+ "st1 { v19.b }[0], [x16], #0x1\n"
+ "st1 { v8.b }[0], [x15], #0x1\n"
+ "st1 { v10.b }[0], [x14], #0x1\n"
+ "st1 { v3.b }[0], [x13], #0x1\n"
"87:" // Oddments: Bit 2: End
"88:" // End
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index df955206e2..71622239b4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -113,1743 +113,1743 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "lsr x2, x1, #0x3\n"
- "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
- "ld1r { v18.16b }, [x20]\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
"ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "lsr x14, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v15.16b }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_b_offset]\n"
"add x20, x23, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x21]\n"
- "ld1r { v26.8h }, [x20]\n"
+ "ld1r { v9.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
"add x21, x23, %[offsetof_Requantize32_minval]\n"
"add x20, x23, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.8h }, [x21]\n"
- "ld1r { v0.8h }, [x20]\n"
- "mov x3, #0x0\n"
- "mov x4, #0x0\n"
- "add x5, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
- "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "ldp x17, x16, [x22, #0x0]\n"
- "ldp x15, x14, [x22, #0x10]\n"
- "cbz x2, 3f\n"
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "subs x2, x2, #0x1\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ssubl v10.8h, v10.8b, v13.8b\n"
- "ldr d12, [x6, #0x20]\n"
+ "ld1r { v10.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "ldp x8, x17, [x22, #0x0]\n"
+ "ldp x16, x15, [x22, #0x10]\n"
+ "cbz x14, 3f\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "subs x14, x14, #0x1\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "ldr d23, [x5, #0x20]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q15, [x20, #0x10]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
"add x20, x20, #0x20\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ldr d31, [x9, x3]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldr d17, [x28, x3]\n"
- "ldr d30, [x27, x3]\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "ldr d16, [x26, x3]\n"
- "ldr d3, [x25, x3]\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "ldr d4, [x24, x3]\n"
- "ldr d25, [x23, x3]\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "ldr d9, [x22, x3]\n"
- "ldr d29, [x21, x3]\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "ldr d28, [x20, x3]\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d24, [x9, x2]\n"
+ "ldr d21, [x28, x2]\n"
+ "ldr d16, [x27, x2]\n"
+ "ldr d20, [x26, x2]\n"
+ "ldr d7, [x25, x2]\n"
+ "ldr d19, [x24, x2]\n"
+ "ldr d28, [x23, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "ldr d26, [x22, x2]\n"
+ "ldr d29, [x21, x2]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr d18, [x20, x2]\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
"beq 2f\n"
"1:" // Loop
- "ldr d2, [x6, #0x28]\n"
- "ldr d27, [x6, #0x30]\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "ldr d1, [x6, #0x38]\n"
- "ldr d31, [x6, #0x40]\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "ldr d8, [x6, #0x48]\n"
- "ldr x22, [x5, #0x50]\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "ldr x20, [x5, #0x58]\n"
- "ldr x21, [x5, #0x60]\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "ldr x22, [x5, #0x70]\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "ldr d14, [x20, x3]\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal v23.4s, v17.4h, v10.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr x20, [x5, #0x78]\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "ldr x21, [x5, #0x80]\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "ldr d25, [x22, x3]\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v10.8h\n"
- "ldr d10, [x20, x3]\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal v24.4s, v17.4h, v21.4h\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x24, [x5, #0x88]\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "smlal v7.4s, v30.4h, v2.4h\n"
- "ldr x20, [x5, #0x90]\n"
- "ldr x23, [x5, #0x98]\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "ldr d9, [x21, x3]\n"
- "smlal2 v22.4s, v17.8h, v21.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "ldr d21, [x6, #0x50]\n"
- "smlal v20.4s, v3.4h, v12.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "ldr x22, [x5, #0xa0]\n"
- "ldr x21, [x5, #0xa8]\n"
- "smlal2 v15.4s, v30.8h, v2.8h\n"
- "ldr d30, [x24, x3]\n"
- "smlal v7.4s, v16.4h, v27.4h\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v12.8h\n"
- "ldr d3, [x6, #0x58]\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "ldr d12, [x20, x3]\n"
- "smlal v20.4s, v16.4h, v2.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal v23.4s, v14.4h, v2.4h\n"
- "ldr x20, [x5, #0xb0]\n"
- "ldr x13, [x5, #0xb8]\n"
- "smlal2 v15.4s, v16.8h, v27.8h\n"
- "smlal v7.4s, v4.4h, v1.4h\n"
- "ldr x12, [x5, #0xc0]\n"
- "ldr x11, [x5, #0xc8]\n"
- "smlal2 v5.4s, v16.8h, v2.8h\n"
- "ldr d16, [x23, x3]\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v2.8h\n"
- "ldr d2, [x6, #0x60]\n"
- "smlal v20.4s, v4.4h, v27.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v27.4h\n"
- "smlal v23.4s, v25.4h, v27.4h\n"
- "ldr x10, [x5, #0xd0]\n"
- "ldr x9, [x5, #0xd8]\n"
- "smlal2 v15.4s, v4.8h, v1.8h\n"
- "smlal v7.4s, v17.4h, v31.4h\n"
- "ldr x28, [x5, #0xe0]\n"
- "ldr x27, [x5, #0xe8]\n"
- "smlal2 v5.4s, v4.8h, v27.8h\n"
- "ldr d4, [x22, x3]\n"
- "smlal2 v22.4s, v14.8h, v27.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v27.8h\n"
- "ldr d27, [x6, #0x68]\n"
- "smlal v20.4s, v17.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v1.4h\n"
- "smlal v23.4s, v10.4h, v1.4h\n"
- "ldr x26, [x5, #0xf0]\n"
- "ldr x25, [x5, #0xf8]\n"
- "smlal2 v15.4s, v17.8h, v31.8h\n"
- "smlal v7.4s, v6.4h, v8.4h\n"
- "ldr x24, [x5, #0x100]\n"
- "ldr x23, [x5, #0x108]\n"
- "smlal2 v5.4s, v17.8h, v1.8h\n"
- "ldr d17, [x21, x3]\n"
- "smlal2 v22.4s, v25.8h, v1.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v1.8h\n"
- "ldr d1, [x6, #0x70]\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v31.4h\n"
- "smlal v23.4s, v9.4h, v31.4h\n"
- "ldr x22, [x5, #0x110]\n"
- "ldr x21, [x5, #0x118]\n"
- "smlal2 v15.4s, v6.8h, v8.8h\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
- "subs x2, x2, #0x1\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal2 v22.4s, v10.8h, v31.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v31.8h\n"
- "ldr d31, [x6, #0x78]\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v9.4h, v8.4h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr d3, [x5, #0x28]\n"
+ "ldr d2, [x5, #0x30]\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "ldr d4, [x5, #0x38]\n"
+ "ldr d22, [x5, #0x40]\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ldr d24, [x5, #0x48]\n"
+ "ldr x23, [x4, #0x50]\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x50]\n"
+ "ldr x22, [x4, #0x58]\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "ldr d21, [x5, #0x58]\n"
+ "ldr x21, [x4, #0x60]\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x28, [x4, #0x70]\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "ldr d12, [x23, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x22, x2]\n"
+ "ldr x27, [x4, #0x78]\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ldr x26, [x4, #0x80]\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "ldr x25, [x4, #0x88]\n"
+ "ldr x24, [x4, #0x90]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
+ "ldr x23, [x4, #0x98]\n"
+ "ldr x22, [x4, #0xa0]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x21, x2]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v17.4h\n"
+ "smlal2 v30.4s, v12.8h, v17.8h\n"
+ "ldr d17, [x20, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal v1.4s, v12.4h, v11.4h\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "ldr x21, [x4, #0xa8]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal2 v25.4s, v12.8h, v11.8h\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x28, x2]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v7.4h, v11.4h\n"
+ "smlal2 v30.4s, v7.8h, v11.8h\n"
+ "ldr d11, [x27, x2]\n"
+ "ldr x13, [x4, #0xb8]\n"
+ "smlal v27.4s, v28.4h, v23.4h\n"
+ "smlal v1.4s, v7.4h, v23.4h\n"
+ "ldr x12, [x4, #0xc0]\n"
+ "ldr x11, [x4, #0xc8]\n"
+ "smlal2 v6.4s, v28.8h, v23.8h\n"
+ "ldr d28, [x26, x2]\n"
+ "smlal2 v25.4s, v7.8h, v23.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v16.4h, v3.4h\n"
+ "smlal2 v0.4s, v16.8h, v3.8h\n"
+ "ldr d16, [x25, x2]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "ldr d23, [x24, x2]\n"
+ "ldr x10, [x4, #0xd0]\n"
+ "smlal v27.4s, v20.4h, v3.4h\n"
+ "smlal v1.4s, v18.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x9, [x4, #0xd8]\n"
+ "smlal2 v6.4s, v20.8h, v3.8h\n"
+ "smlal2 v25.4s, v18.8h, v3.8h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x28, [x4, #0xe0]\n"
+ "smlal v8.4s, v20.4h, v2.4h\n"
+ "smlal2 v0.4s, v20.8h, v2.8h\n"
+ "ldr d20, [x23, x2]\n"
+ "usubl v23.8h, v23.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "smlal2 v30.4s, v17.8h, v3.8h\n"
+ "ldr d3, [x5, #0x60]\n"
+ "ldr x27, [x4, #0xe8]\n"
+ "smlal v27.4s, v19.4h, v2.4h\n"
+ "smlal v1.4s, v17.4h, v2.4h\n"
+ "ldr x26, [x4, #0xf0]\n"
+ "ldr x25, [x4, #0xf8]\n"
+ "smlal2 v6.4s, v19.8h, v2.8h\n"
+ "smlal2 v25.4s, v17.8h, v2.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x24, [x4, #0x100]\n"
+ "smlal v8.4s, v19.4h, v4.4h\n"
+ "smlal2 v0.4s, v19.8h, v4.8h\n"
+ "ldr d19, [x22, x2]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v2.4h\n"
+ "smlal2 v30.4s, v26.8h, v2.8h\n"
+ "ldr d2, [x5, #0x68]\n"
+ "ldr x23, [x4, #0x108]\n"
+ "smlal v27.4s, v12.4h, v4.4h\n"
+ "smlal v1.4s, v26.4h, v4.4h\n"
+ "ldr x22, [x4, #0x110]\n"
+ "subs x14, x14, #0x1\n"
+ "smlal2 v6.4s, v12.8h, v4.8h\n"
+ "smlal2 v25.4s, v26.8h, v4.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v22.4h\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d12, [x21, x2]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v11.8h, v4.8h\n"
+ "ldr d4, [x5, #0x70]\n"
+ "ldr x21, [x4, #0x118]\n"
+ "smlal v27.4s, v7.4h, v22.4h\n"
+ "smlal v1.4s, v11.4h, v22.4h\n"
+ "smlal2 v6.4s, v7.8h, v22.8h\n"
+ "smlal2 v25.4s, v11.8h, v22.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v24.4h\n"
+ "smlal2 v0.4s, v7.8h, v24.8h\n"
+ "ldr d7, [x20, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v22.4h\n"
+ "smlal2 v30.4s, v28.8h, v22.8h\n"
+ "ldr d22, [x5, #0x78]\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "ldr d28, [x13, x3]\n"
- "smlal v7.4s, v14.4h, v3.4h\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v5.4s, v29.8h, v8.8h\n"
- "ldr d29, [x6, #0x80]\n"
- "smlal2 v22.4s, v9.8h, v8.8h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "ldr d8, [x12, x3]\n"
- "smlal v20.4s, v14.4h, v21.4h\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "smlal v24.4s, v12.4h, v21.4h\n"
- "smlal v23.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v14.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v14.8h, v21.8h\n"
- "ldr d14, [x11, x3]\n"
- "smlal2 v22.4s, v12.8h, v21.8h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v21.8h\n"
- "ldr d21, [x6, #0x88]\n"
- "smlal v20.4s, v25.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v16.4h, v3.4h\n"
- "smlal v23.4s, v4.4h, v3.4h\n"
- "smlal2 v15.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v10.4h, v27.4h\n"
- "smlal2 v5.4s, v25.8h, v3.8h\n"
- "ldr d25, [x10, x3]\n"
- "smlal2 v22.4s, v16.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v3.8h\n"
- "ldr d3, [x6, #0x90]\n"
- "smlal v20.4s, v10.4h, v2.4h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal v24.4s, v4.4h, v2.4h\n"
- "smlal v23.4s, v17.4h, v2.4h\n"
- "smlal2 v15.4s, v10.8h, v27.8h\n"
- "smlal v7.4s, v9.4h, v1.4h\n"
- "smlal2 v5.4s, v10.8h, v2.8h\n"
- "ldr d10, [x9, x3]\n"
- "smlal2 v22.4s, v4.8h, v2.8h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v2.8h\n"
- "ldr d2, [x6, #0x98]\n"
- "smlal v20.4s, v9.4h, v27.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v17.4h, v27.4h\n"
- "smlal v23.4s, v6.4h, v27.4h\n"
- "smlal2 v15.4s, v9.8h, v1.8h\n"
- "smlal v7.4s, v12.4h, v31.4h\n"
- "smlal2 v5.4s, v9.8h, v27.8h\n"
- "ldr d9, [x28, x3]\n"
- "smlal2 v22.4s, v17.8h, v27.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v27.8h\n"
- "ldr d27, [x6, #0xa0]\n"
- "smlal v20.4s, v30.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v1.4h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v15.4s, v12.8h, v31.8h\n"
- "ldr d12, [x27, x3]\n"
- "smlal v7.4s, v16.4h, v29.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal2 v5.4s, v30.8h, v1.8h\n"
- "ldr d30, [x6, #0xa8]\n"
- "smlal2 v22.4s, v6.8h, v1.8h\n"
- "ssubl v30.8h, v30.8b, v13.8b\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "ldr d1, [x26, x3]\n"
- "smlal v20.4s, v16.4h, v31.4h\n"
- "usubl v1.8h, v1.8b, v18.8b\n"
- "smlal v24.4s, v8.4h, v31.4h\n"
- "smlal v23.4s, v14.4h, v31.4h\n"
- "smlal2 v15.4s, v16.8h, v29.8h\n"
- "smlal v7.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v16.8h, v31.8h\n"
- "ldr d16, [x25, x3]\n"
- "smlal2 v22.4s, v8.8h, v31.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v31.8h\n"
- "ldr d31, [x6, #0xb0]\n"
- "smlal v20.4s, v4.4h, v29.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v29.4h\n"
- "smlal v23.4s, v25.4h, v29.4h\n"
- "smlal2 v15.4s, v4.8h, v21.8h\n"
- "smlal v7.4s, v17.4h, v3.4h\n"
- "smlal2 v5.4s, v4.8h, v29.8h\n"
- "ldr d4, [x24, x3]\n"
- "smlal2 v22.4s, v14.8h, v29.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v29.8h\n"
- "ldr d29, [x6, #0xb8]\n"
- "smlal v20.4s, v17.4h, v21.4h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v15.4s, v17.8h, v3.8h\n"
- "smlal v7.4s, v6.4h, v2.4h\n"
- "smlal2 v5.4s, v17.8h, v21.8h\n"
- "ldr d17, [x23, x3]\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "ldr d21, [x6, #0xc0]\n"
- "smlal v20.4s, v6.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v3.4h\n"
- "smlal v23.4s, v9.4h, v3.4h\n"
- "add x6, x6, #0xc8\n"
- "smlal2 v15.4s, v6.8h, v2.8h\n"
- "smlal v7.4s, v8.4h, v27.4h\n"
- "smlal2 v5.4s, v6.8h, v3.8h\n"
- "ldr d6, [x22, x3]\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v3.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal v20.4s, v28.4h, v2.4h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal v24.4s, v9.4h, v2.4h\n"
- "smlal v23.4s, v12.4h, v2.4h\n"
- "add x3, x3, #0x8\n"
- "smlal2 v15.4s, v8.8h, v27.8h\n"
- "ldr q8, [x7, #0x0]\n"
- "smlal v7.4s, v14.4h, v30.4h\n"
- "smlal2 v5.4s, v28.8h, v2.8h\n"
- "ldr q28, [x8, #0x0]\n"
- "smlal2 v22.4s, v9.8h, v2.8h\n"
- "smlal2 v19.4s, v12.8h, v2.8h\n"
- "ldr q2, [x7, #0x10]\n"
- "smlal v20.4s, v14.4h, v27.4h\n"
+ "smlal v27.4s, v29.4h, v24.4h\n"
+ "smlal v1.4s, v28.4h, v24.4h\n"
+ "smlal2 v6.4s, v29.8h, v24.8h\n"
+ "ldr d29, [x13, x2]\n"
+ "smlal2 v25.4s, v28.8h, v24.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v18.4h, v31.4h\n"
+ "smlal2 v0.4s, v18.8h, v31.8h\n"
+ "ldr d18, [x5, #0x80]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v24.4h\n"
+ "smlal2 v30.4s, v16.8h, v24.8h\n"
+ "ldr d24, [x12, x2]\n"
+ "smlal v27.4s, v17.4h, v31.4h\n"
+ "smlal v1.4s, v23.4h, v31.4h\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v6.4s, v17.8h, v31.8h\n"
+ "smlal2 v25.4s, v23.8h, v31.8h\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v8.4s, v17.4h, v21.4h\n"
+ "smlal2 v0.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x11, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x88]\n"
+ "smlal v27.4s, v26.4h, v21.4h\n"
+ "smlal v1.4s, v20.4h, v21.4h\n"
+ "smlal2 v6.4s, v26.8h, v21.8h\n"
+ "smlal2 v25.4s, v20.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v3.4h\n"
+ "smlal2 v0.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x10, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v21.4h\n"
+ "smlal2 v30.4s, v19.8h, v21.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "smlal v27.4s, v11.4h, v3.4h\n"
+ "smlal v1.4s, v19.4h, v3.4h\n"
+ "smlal2 v6.4s, v11.8h, v3.8h\n"
+ "smlal2 v25.4s, v19.8h, v3.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v11.4h, v2.4h\n"
+ "smlal2 v0.4s, v11.8h, v2.8h\n"
+ "ldr d11, [x9, x2]\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v3.4h\n"
+ "smlal2 v30.4s, v12.8h, v3.8h\n"
+ "ldr d3, [x5, #0x98]\n"
+ "smlal v27.4s, v28.4h, v2.4h\n"
+ "smlal v1.4s, v12.4h, v2.4h\n"
+ "smlal2 v6.4s, v28.8h, v2.8h\n"
+ "smlal2 v25.4s, v12.8h, v2.8h\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v4.4h\n"
+ "smlal2 v0.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x28, x2]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v2.4h\n"
+ "smlal2 v30.4s, v7.8h, v2.8h\n"
+ "ldr d2, [x5, #0xa0]\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v1.4s, v7.4h, v4.4h\n"
+ "smlal2 v6.4s, v16.8h, v4.8h\n"
+ "ldr d16, [x27, x2]\n"
+ "smlal2 v25.4s, v7.8h, v4.8h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v22.4h\n"
+ "smlal2 v0.4s, v23.8h, v22.8h\n"
+ "ldr d23, [x5, #0xa8]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v4.4h\n"
+ "smlal2 v30.4s, v29.8h, v4.8h\n"
+ "ldr d4, [x26, x2]\n"
+ "smlal v27.4s, v20.4h, v22.4h\n"
+ "smlal v1.4s, v24.4h, v22.4h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "smlal2 v6.4s, v20.8h, v22.8h\n"
+ "smlal2 v25.4s, v24.8h, v22.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x25, x2]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v22.4h\n"
+ "smlal2 v30.4s, v17.8h, v22.8h\n"
+ "ldr d22, [x5, #0xb0]\n"
+ "smlal v27.4s, v19.4h, v18.4h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v6.4s, v19.8h, v18.8h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "smlal v8.4s, v19.4h, v31.4h\n"
+ "smlal2 v0.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x24, x2]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v30.4s, v26.8h, v18.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "smlal v27.4s, v12.4h, v31.4h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v6.4s, v12.8h, v31.8h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v21.4h\n"
+ "smlal2 v0.4s, v12.8h, v21.8h\n"
+ "ldr d12, [x23, x2]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d31, [x5, #0xc0]\n"
+ "add x5, x5, #0xc8\n"
+ "smlal v27.4s, v7.4h, v21.4h\n"
+ "smlal v1.4s, v11.4h, v21.4h\n"
+ "smlal2 v6.4s, v7.8h, v21.8h\n"
+ "smlal2 v25.4s, v11.8h, v21.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v3.4h\n"
+ "smlal2 v0.4s, v7.8h, v3.8h\n"
+ "ldr d7, [x22, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v21.4h\n"
+ "smlal2 v30.4s, v28.8h, v21.8h\n"
+ "ldr d21, [x21, x2]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v27.4s, v29.4h, v3.4h\n"
+ "smlal v1.4s, v28.4h, v3.4h\n"
+ "smlal2 v6.4s, v29.8h, v3.8h\n"
+ "ldr q29, [x6, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v3.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v2.4h\n"
+ "smlal2 v0.4s, v24.8h, v2.8h\n"
+ "ldr q24, [x7, #0x0]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "smlal v5.4s, v16.4h, v3.4h\n"
+ "smlal2 v30.4s, v16.8h, v3.8h\n"
+ "ldr q3, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v27.4s, v17.4h, v2.4h\n"
+ "smlal v1.4s, v4.4h, v2.4h\n"
+ "smlal2 v6.4s, v17.8h, v2.8h\n"
+ "smlal2 v25.4s, v4.8h, v2.8h\n"
+ "ldr q4, [x7, #0x10]\n"
"add x7, x7, #0x20\n"
- "smlal v24.4s, v1.4h, v27.4h\n"
- "smlal v23.4s, v16.4h, v27.4h\n"
- "smlal2 v15.4s, v14.8h, v30.8h\n"
- "smlal v7.4s, v25.4h, v31.4h\n"
- "smlal2 v5.4s, v14.8h, v27.8h\n"
- "ldr q14, [x8, #0x10]\n"
- "smlal2 v22.4s, v1.8h, v27.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v19.4s, v16.8h, v27.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal v24.4s, v16.4h, v30.4h\n"
- "smlal v23.4s, v4.4h, v30.4h\n"
- "smlal2 v15.4s, v25.8h, v31.8h\n"
- "smlal v7.4s, v10.4h, v29.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal2 v22.4s, v16.8h, v30.8h\n"
- "smlal2 v19.4s, v4.8h, v30.8h\n"
- "smlal v20.4s, v10.4h, v31.4h\n"
- "smlal v24.4s, v4.4h, v31.4h\n"
- "smlal v23.4s, v17.4h, v31.4h\n"
- "smlal2 v15.4s, v10.8h, v29.8h\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "sqrdmulh v7.4s, v7.4s, v8.4s\n"
- "smlal2 v5.4s, v10.8h, v31.8h\n"
- "smlal2 v22.4s, v4.8h, v31.8h\n"
- "and v27.16b, v7.16b, v28.16b\n"
- "smlal2 v19.4s, v17.8h, v31.8h\n"
- "smlal v20.4s, v9.4h, v29.4h\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smlal v24.4s, v17.4h, v29.4h\n"
- "smlal v23.4s, v6.4h, v29.4h\n"
- "sqadd v7.4s, v7.4s, v27.4s\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal2 v5.4s, v9.8h, v29.8h\n"
- "sqrdmulh v15.4s, v15.4s, v2.4s\n"
- "smlal2 v22.4s, v17.8h, v29.8h\n"
- "smlal2 v19.4s, v6.8h, v29.8h\n"
- "and v9.16b, v15.16b, v14.16b\n"
- "smlal v20.4s, v12.4h, v21.4h\n"
- "smlal v24.4s, v6.4h, v21.4h\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smlal v23.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v12.8h, v21.8h\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "smlal2 v22.4s, v6.8h, v21.8h\n"
- "smlal2 v19.4s, v3.8h, v21.8h\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "and v25.16b, v20.16b, v28.16b\n"
- "sqrdmulh v5.4s, v5.4s, v2.4s\n"
- "and v10.16b, v24.16b, v28.16b\n"
- "sqrdmulh v22.4s, v22.4s, v2.4s\n"
- "and v21.16b, v23.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v2.4s\n"
- "sqadd v15.4s, v15.4s, v9.4s\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "and v9.16b, v5.16b, v14.16b\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "and v12.16b, v22.16b, v14.16b\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v17.16b, v19.16b, v14.16b\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v10.4s\n"
+ "smlal v8.4s, v17.4h, v23.4h\n"
+ "smlal2 v0.4s, v17.8h, v23.8h\n"
+ "smlal v5.4s, v20.4h, v2.4h\n"
+ "smlal2 v30.4s, v20.8h, v2.8h\n"
+ "smlal v27.4s, v26.4h, v23.4h\n"
+ "smlal v1.4s, v20.4h, v23.4h\n"
+ "smlal2 v6.4s, v26.8h, v23.8h\n"
+ "smlal2 v25.4s, v20.8h, v23.8h\n"
+ "smlal v8.4s, v26.4h, v22.4h\n"
+ "smlal2 v0.4s, v26.8h, v22.8h\n"
+ "smlal v5.4s, v19.4h, v23.4h\n"
+ "smlal2 v30.4s, v19.8h, v23.8h\n"
+ "smlal v27.4s, v11.4h, v22.4h\n"
+ "smlal v1.4s, v19.4h, v22.4h\n"
+ "smlal2 v6.4s, v11.8h, v22.8h\n"
+ "smlal2 v25.4s, v19.8h, v22.8h\n"
+ "smlal v8.4s, v11.4h, v18.4h\n"
+ "smlal2 v0.4s, v11.8h, v18.8h\n"
+ "smlal v5.4s, v12.4h, v22.4h\n"
+ "smlal2 v30.4s, v12.8h, v22.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal v1.4s, v12.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal2 v25.4s, v12.8h, v18.8h\n"
+ "smlal v8.4s, v28.4h, v31.4h\n"
+ "smlal2 v0.4s, v28.8h, v31.8h\n"
+ "smlal v5.4s, v7.4h, v18.4h\n"
+ "smlal2 v30.4s, v7.8h, v18.8h\n"
+ "smlal v27.4s, v16.4h, v31.4h\n"
+ "smlal v1.4s, v7.4h, v31.4h\n"
+ "smlal2 v6.4s, v16.8h, v31.8h\n"
+ "smlal2 v25.4s, v7.8h, v31.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "smlal v5.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "and v17.16b, v8.16b, v24.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v12.16b, v0.16b, v4.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
"sshr v12.4s, v12.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v21.4s\n"
+ "and v21.16b, v27.16b, v24.16b\n"
+ "and v16.16b, v1.16b, v24.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v28.16b, v5.16b, v24.16b\n"
+ "sqadd v0.4s, v0.4s, v12.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v4.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v17.16b, v25.16b, v4.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v3.16b, v30.16b, v4.16b\n"
+ "sqadd v27.4s, v27.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v16.4s\n"
"sshr v17.4s, v17.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v28.4s\n"
- "srshl v20.4s, v20.4s, v28.4s\n"
- "sqadd v5.4s, v5.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqadd v22.4s, v22.4s, v12.4s\n"
- "srshl v23.4s, v23.4s, v28.4s\n"
- "sqadd v19.4s, v19.4s, v17.4s\n"
- "srshl v15.4s, v15.4s, v14.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v14.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v14.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x17, x4]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d20, [x16, x4]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x15, x4]\n"
- "str d23, [x14, x4]\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q15, [x20, #0x10]\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v24.4s\n"
+ "srshl v27.4s, v27.4s, v24.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v24.4s\n"
+ "sqadd v30.4s, v30.4s, v3.4s\n"
+ "srshl v0.4s, v0.4s, v4.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v4.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d8, [x8, x3]\n"
+ "str d27, [x17, x3]\n"
+ "str d1, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "add x3, x3, #0x8\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
"add x20, x20, #0x20\n"
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "add x4, x4, #0x8\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldr d12, [x6, #0x20]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ldr d31, [x9, x3]\n"
- "ldr d17, [x28, x3]\n"
- "ssubl v10.8h, v10.8b, v13.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr d30, [x27, x3]\n"
- "ldr d16, [x26, x3]\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "usubl v31.8h, v31.8b, v18.8b\n"
- "ldr d3, [x25, x3]\n"
- "ldr d4, [x24, x3]\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "ldr d25, [x23, x3]\n"
- "ldr d9, [x22, x3]\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "ldr d29, [x21, x3]\n"
- "ldr d28, [x20, x3]\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "ldr d23, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "ldr d24, [x9, x2]\n"
+ "ldr d21, [x28, x2]\n"
+ "ldr d16, [x27, x2]\n"
+ "ldr d20, [x26, x2]\n"
+ "ldr d7, [x25, x2]\n"
+ "ldr d19, [x24, x2]\n"
+ "ldr d28, [x23, x2]\n"
+ "ldr d26, [x22, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr d29, [x21, x2]\n"
+ "ldr d18, [x20, x2]\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
"bgt 1b\n"
"2:" // Tail
- "ldr d27, [x6, #0x28]\n"
- "ldr d1, [x6, #0x30]\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "ldr d2, [x6, #0x38]\n"
- "ldr d31, [x6, #0x40]\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "ldr d8, [x6, #0x48]\n"
- "ldr x22, [x5, #0x50]\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "ldr x20, [x5, #0x58]\n"
- "ldr x21, [x5, #0x60]\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "ldr d6, [x20, x3]\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr x22, [x5, #0x70]\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "ldr d3, [x21, x3]\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "ldr d14, [x20, x3]\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal v23.4s, v17.4h, v10.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr x21, [x5, #0x78]\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "ldr x20, [x5, #0x80]\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "ldr d25, [x22, x3]\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v10.8h\n"
- "ldr d10, [x21, x3]\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal v24.4s, v17.4h, v21.4h\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x24, [x5, #0x88]\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "smlal v7.4s, v30.4h, v27.4h\n"
- "ldr x23, [x5, #0x90]\n"
- "ldr x22, [x5, #0x98]\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "ldr d9, [x20, x3]\n"
- "smlal2 v22.4s, v17.8h, v21.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "ldr d21, [x6, #0x50]\n"
- "smlal v20.4s, v3.4h, v12.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "ldr x21, [x5, #0xa0]\n"
- "ldr x20, [x5, #0xa8]\n"
- "smlal2 v15.4s, v30.8h, v27.8h\n"
- "ldr d30, [x24, x3]\n"
- "smlal v7.4s, v16.4h, v1.4h\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v12.8h\n"
- "ldr d3, [x6, #0x58]\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "ldr d12, [x23, x3]\n"
- "smlal v20.4s, v16.4h, v27.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal v24.4s, v28.4h, v27.4h\n"
- "smlal v23.4s, v14.4h, v27.4h\n"
- "ldr x13, [x5, #0xb0]\n"
- "ldr x12, [x5, #0xb8]\n"
- "smlal2 v15.4s, v16.8h, v1.8h\n"
- "smlal v7.4s, v4.4h, v2.4h\n"
- "ldr x11, [x5, #0xc0]\n"
- "ldr x10, [x5, #0xc8]\n"
- "smlal2 v5.4s, v16.8h, v27.8h\n"
- "ldr d16, [x22, x3]\n"
- "smlal2 v22.4s, v28.8h, v27.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v27.8h\n"
- "ldr d27, [x6, #0x60]\n"
- "smlal v20.4s, v4.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v1.4h\n"
- "smlal v23.4s, v25.4h, v1.4h\n"
- "ldr x9, [x5, #0xd0]\n"
- "ldr x28, [x5, #0xd8]\n"
- "smlal2 v15.4s, v4.8h, v2.8h\n"
- "smlal v7.4s, v17.4h, v31.4h\n"
- "ldr x27, [x5, #0xe0]\n"
- "ldr x26, [x5, #0xe8]\n"
- "smlal2 v5.4s, v4.8h, v1.8h\n"
- "ldr d4, [x21, x3]\n"
- "smlal2 v22.4s, v14.8h, v1.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v1.8h\n"
- "ldr d1, [x6, #0x68]\n"
- "smlal v20.4s, v17.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v2.4h\n"
- "smlal v23.4s, v10.4h, v2.4h\n"
- "ldr x25, [x5, #0xf0]\n"
- "ldr x24, [x5, #0xf8]\n"
- "smlal2 v15.4s, v17.8h, v31.8h\n"
- "smlal v7.4s, v6.4h, v8.4h\n"
- "ldr x23, [x5, #0x100]\n"
- "ldr x22, [x5, #0x108]\n"
- "smlal2 v5.4s, v17.8h, v2.8h\n"
- "ldr d17, [x20, x3]\n"
- "smlal2 v22.4s, v25.8h, v2.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v2.8h\n"
- "ldr d2, [x6, #0x70]\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v31.4h\n"
- "smlal v23.4s, v9.4h, v31.4h\n"
- "ldr x21, [x5, #0x110]\n"
- "ldr x20, [x5, #0x118]\n"
- "smlal2 v15.4s, v6.8h, v8.8h\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
+ "ldr d4, [x5, #0x28]\n"
+ "ldr d3, [x5, #0x30]\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "ldr d22, [x5, #0x38]\n"
+ "ldr d2, [x5, #0x40]\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "ldr d24, [x5, #0x48]\n"
+ "ldr x21, [x4, #0x50]\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x50]\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "ldr d21, [x5, #0x58]\n"
+ "ldr x28, [x4, #0x60]\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "ldr x27, [x4, #0x68]\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "ldr x26, [x4, #0x70]\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "ldr d12, [x21, x2]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "ldr d7, [x20, x2]\n"
+ "ldr x25, [x4, #0x78]\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "ldr x24, [x4, #0x80]\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "ldr x23, [x4, #0x88]\n"
+ "ldr x22, [x4, #0x90]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
+ "ldr x21, [x4, #0x98]\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "ldr d28, [x28, x2]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v17.4h\n"
+ "smlal2 v30.4s, v12.8h, v17.8h\n"
+ "ldr d17, [x27, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal v1.4s, v12.4h, v11.4h\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "ldr x14, [x4, #0xa8]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal2 v25.4s, v12.8h, v11.8h\n"
+ "ldr x13, [x4, #0xb0]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x26, x2]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v7.4h, v11.4h\n"
+ "smlal2 v30.4s, v7.8h, v11.8h\n"
+ "ldr d11, [x25, x2]\n"
+ "ldr x12, [x4, #0xb8]\n"
+ "smlal v27.4s, v28.4h, v23.4h\n"
+ "smlal v1.4s, v7.4h, v23.4h\n"
+ "ldr x11, [x4, #0xc0]\n"
+ "ldr x10, [x4, #0xc8]\n"
+ "smlal2 v6.4s, v28.8h, v23.8h\n"
+ "ldr d28, [x24, x2]\n"
+ "smlal2 v25.4s, v7.8h, v23.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "ldr d16, [x23, x2]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "ldr d23, [x22, x2]\n"
+ "ldr x9, [x4, #0xd0]\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal v1.4s, v18.4h, v4.4h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x28, [x4, #0xd8]\n"
+ "smlal2 v6.4s, v20.8h, v4.8h\n"
+ "smlal2 v25.4s, v18.8h, v4.8h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x27, [x4, #0xe0]\n"
+ "smlal v8.4s, v20.4h, v3.4h\n"
+ "smlal2 v0.4s, v20.8h, v3.8h\n"
+ "ldr d20, [x21, x2]\n"
+ "usubl v23.8h, v23.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v4.4h\n"
+ "smlal2 v30.4s, v17.8h, v4.8h\n"
+ "ldr d4, [x5, #0x60]\n"
+ "ldr x26, [x4, #0xe8]\n"
+ "smlal v27.4s, v19.4h, v3.4h\n"
+ "smlal v1.4s, v17.4h, v3.4h\n"
+ "ldr x25, [x4, #0xf0]\n"
+ "ldr x24, [x4, #0xf8]\n"
+ "smlal2 v6.4s, v19.8h, v3.8h\n"
+ "smlal2 v25.4s, v17.8h, v3.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x23, [x4, #0x100]\n"
+ "smlal v8.4s, v19.4h, v22.4h\n"
+ "smlal2 v0.4s, v19.8h, v22.8h\n"
+ "ldr d19, [x20, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v3.4h\n"
+ "smlal2 v30.4s, v26.8h, v3.8h\n"
+ "ldr d3, [x5, #0x68]\n"
+ "ldr x22, [x4, #0x108]\n"
+ "smlal v27.4s, v12.4h, v22.4h\n"
+ "smlal v1.4s, v26.4h, v22.4h\n"
+ "ldr x21, [x4, #0x110]\n"
+ "ldr x20, [x4, #0x118]\n"
+ "smlal2 v6.4s, v12.8h, v22.8h\n"
+ "smlal2 v25.4s, v26.8h, v22.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
"tst x1, #0x7\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "ldr d6, [x13, x3]\n"
- "smlal2 v22.4s, v10.8h, v31.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v31.8h\n"
- "ldr d31, [x6, #0x78]\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v9.4h, v8.4h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "ldr d28, [x12, x3]\n"
- "smlal v7.4s, v14.4h, v3.4h\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v5.4s, v29.8h, v8.8h\n"
- "ldr d29, [x6, #0x80]\n"
- "smlal2 v22.4s, v9.8h, v8.8h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "ldr d8, [x11, x3]\n"
- "smlal v20.4s, v14.4h, v21.4h\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "smlal v24.4s, v12.4h, v21.4h\n"
- "smlal v23.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v14.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v27.4h\n"
- "smlal2 v5.4s, v14.8h, v21.8h\n"
- "ldr d14, [x10, x3]\n"
- "smlal2 v22.4s, v12.8h, v21.8h\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "smlal2 v19.4s, v16.8h, v21.8h\n"
- "ldr d21, [x6, #0x88]\n"
- "smlal v20.4s, v25.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v16.4h, v3.4h\n"
- "smlal v23.4s, v4.4h, v3.4h\n"
- "smlal2 v15.4s, v25.8h, v27.8h\n"
- "smlal v7.4s, v10.4h, v1.4h\n"
- "smlal2 v5.4s, v25.8h, v3.8h\n"
- "ldr d25, [x9, x3]\n"
- "smlal2 v22.4s, v16.8h, v3.8h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v3.8h\n"
- "ldr d3, [x6, #0x90]\n"
- "smlal v20.4s, v10.4h, v27.4h\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal v24.4s, v4.4h, v27.4h\n"
- "smlal v23.4s, v17.4h, v27.4h\n"
- "smlal2 v15.4s, v10.8h, v1.8h\n"
- "smlal v7.4s, v9.4h, v2.4h\n"
- "smlal2 v5.4s, v10.8h, v27.8h\n"
- "ldr d10, [x28, x3]\n"
- "smlal2 v22.4s, v4.8h, v27.8h\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "smlal2 v19.4s, v17.8h, v27.8h\n"
- "ldr d27, [x6, #0x98]\n"
- "smlal v20.4s, v9.4h, v1.4h\n"
- "ssubl v27.8h, v27.8b, v13.8b\n"
- "smlal v24.4s, v17.4h, v1.4h\n"
- "smlal v23.4s, v6.4h, v1.4h\n"
- "smlal2 v15.4s, v9.8h, v2.8h\n"
- "smlal v7.4s, v12.4h, v31.4h\n"
- "smlal2 v5.4s, v9.8h, v1.8h\n"
- "ldr d9, [x27, x3]\n"
- "smlal2 v22.4s, v17.8h, v1.8h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v19.4s, v6.8h, v1.8h\n"
- "ldr d1, [x6, #0xa0]\n"
- "smlal v20.4s, v30.4h, v2.4h\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal v24.4s, v6.4h, v2.4h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v15.4s, v12.8h, v31.8h\n"
- "ldr d12, [x26, x3]\n"
- "smlal v7.4s, v16.4h, v29.4h\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d30, [x6, #0xa8]\n"
- "smlal2 v22.4s, v6.8h, v2.8h\n"
- "ssubl v30.8h, v30.8b, v13.8b\n"
- "smlal2 v19.4s, v28.8h, v2.8h\n"
- "ldr d2, [x25, x3]\n"
- "smlal v20.4s, v16.4h, v31.4h\n"
- "usubl v2.8h, v2.8b, v18.8b\n"
- "smlal v24.4s, v8.4h, v31.4h\n"
- "smlal v23.4s, v14.4h, v31.4h\n"
- "smlal2 v15.4s, v16.8h, v29.8h\n"
- "smlal v7.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v16.8h, v31.8h\n"
- "ldr d16, [x24, x3]\n"
- "smlal2 v22.4s, v8.8h, v31.8h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "smlal2 v19.4s, v14.8h, v31.8h\n"
- "ldr d31, [x6, #0xb0]\n"
- "smlal v20.4s, v4.4h, v29.4h\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "smlal v24.4s, v14.4h, v29.4h\n"
- "smlal v23.4s, v25.4h, v29.4h\n"
- "smlal2 v15.4s, v4.8h, v21.8h\n"
- "smlal v7.4s, v17.4h, v3.4h\n"
- "smlal2 v5.4s, v4.8h, v29.8h\n"
- "ldr d4, [x23, x3]\n"
- "smlal2 v22.4s, v14.8h, v29.8h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v19.4s, v25.8h, v29.8h\n"
- "ldr d29, [x6, #0xb8]\n"
- "smlal v20.4s, v17.4h, v21.4h\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v15.4s, v17.8h, v3.8h\n"
- "smlal v7.4s, v6.4h, v27.4h\n"
- "smlal2 v5.4s, v17.8h, v21.8h\n"
- "ldr d17, [x22, x3]\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "ldr d21, [x6, #0xc0]\n"
- "smlal v20.4s, v6.4h, v3.4h\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "smlal v24.4s, v10.4h, v3.4h\n"
- "smlal v23.4s, v9.4h, v3.4h\n"
- "smlal2 v15.4s, v6.8h, v27.8h\n"
- "smlal v7.4s, v8.4h, v1.4h\n"
- "smlal2 v5.4s, v6.8h, v3.8h\n"
- "ldr d6, [x21, x3]\n"
- "smlal2 v22.4s, v10.8h, v3.8h\n"
- "usubl v6.8h, v6.8b, v18.8b\n"
- "smlal2 v19.4s, v9.8h, v3.8h\n"
- "ldr d3, [x20, x3]\n"
- "smlal v20.4s, v28.4h, v27.4h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal v24.4s, v9.4h, v27.4h\n"
- "smlal v23.4s, v12.4h, v27.4h\n"
- "add x3, x3, #0x8\n"
- "smlal2 v15.4s, v8.8h, v1.8h\n"
- "ldr q8, [x7, #0x0]\n"
- "smlal v7.4s, v14.4h, v30.4h\n"
- "smlal2 v5.4s, v28.8h, v27.8h\n"
- "ldr q28, [x8, #0x0]\n"
- "smlal2 v22.4s, v9.8h, v27.8h\n"
- "smlal2 v19.4s, v12.8h, v27.8h\n"
- "ldr q27, [x7, #0x10]\n"
- "smlal v20.4s, v14.4h, v1.4h\n"
+ "smlal v8.4s, v12.4h, v2.4h\n"
+ "smlal2 v0.4s, v12.8h, v2.8h\n"
+ "ldr d12, [x14, x2]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v22.4h\n"
+ "smlal2 v30.4s, v11.8h, v22.8h\n"
+ "ldr d22, [x5, #0x70]\n"
+ "smlal v27.4s, v7.4h, v2.4h\n"
+ "smlal v1.4s, v11.4h, v2.4h\n"
+ "smlal2 v6.4s, v7.8h, v2.8h\n"
+ "smlal2 v25.4s, v11.8h, v2.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v24.4h\n"
+ "smlal2 v0.4s, v7.8h, v24.8h\n"
+ "ldr d7, [x13, x2]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v2.4h\n"
+ "smlal2 v30.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x5, #0x78]\n"
+ "smlal v27.4s, v29.4h, v24.4h\n"
+ "smlal v1.4s, v28.4h, v24.4h\n"
+ "smlal2 v6.4s, v29.8h, v24.8h\n"
+ "ldr d29, [x12, x2]\n"
+ "smlal2 v25.4s, v28.8h, v24.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v18.4h, v31.4h\n"
+ "smlal2 v0.4s, v18.8h, v31.8h\n"
+ "ldr d18, [x5, #0x80]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v24.4h\n"
+ "smlal2 v30.4s, v16.8h, v24.8h\n"
+ "ldr d24, [x11, x2]\n"
+ "smlal v27.4s, v17.4h, v31.4h\n"
+ "smlal v1.4s, v23.4h, v31.4h\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v6.4s, v17.8h, v31.8h\n"
+ "smlal2 v25.4s, v23.8h, v31.8h\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v8.4s, v17.4h, v21.4h\n"
+ "smlal2 v0.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x10, x2]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "ldr d31, [x5, #0x88]\n"
+ "smlal v27.4s, v26.4h, v21.4h\n"
+ "smlal v1.4s, v20.4h, v21.4h\n"
+ "smlal2 v6.4s, v26.8h, v21.8h\n"
+ "smlal2 v25.4s, v20.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v8.4s, v26.4h, v4.4h\n"
+ "smlal2 v0.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x9, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v21.4h\n"
+ "smlal2 v30.4s, v19.8h, v21.8h\n"
+ "ldr d21, [x5, #0x90]\n"
+ "smlal v27.4s, v11.4h, v4.4h\n"
+ "smlal v1.4s, v19.4h, v4.4h\n"
+ "smlal2 v6.4s, v11.8h, v4.8h\n"
+ "smlal2 v25.4s, v19.8h, v4.8h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v0.4s, v11.8h, v3.8h\n"
+ "ldr d11, [x28, x2]\n"
+ "ssubl v21.8h, v21.8b, v9.8b\n"
+ "smlal v5.4s, v12.4h, v4.4h\n"
+ "smlal2 v30.4s, v12.8h, v4.8h\n"
+ "ldr d4, [x5, #0x98]\n"
+ "smlal v27.4s, v28.4h, v3.4h\n"
+ "smlal v1.4s, v12.4h, v3.4h\n"
+ "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "smlal2 v25.4s, v12.8h, v3.8h\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal2 v0.4s, v28.8h, v22.8h\n"
+ "ldr d28, [x27, x2]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v3.4h\n"
+ "smlal2 v30.4s, v7.8h, v3.8h\n"
+ "ldr d3, [x5, #0xa0]\n"
+ "smlal v27.4s, v16.4h, v22.4h\n"
+ "smlal v1.4s, v7.4h, v22.4h\n"
+ "smlal2 v6.4s, v16.8h, v22.8h\n"
+ "ldr d16, [x26, x2]\n"
+ "smlal2 v25.4s, v7.8h, v22.8h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "smlal v8.4s, v23.4h, v2.4h\n"
+ "smlal2 v0.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x5, #0xa8]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v22.4h\n"
+ "smlal2 v30.4s, v29.8h, v22.8h\n"
+ "ldr d22, [x25, x2]\n"
+ "smlal v27.4s, v20.4h, v2.4h\n"
+ "smlal v1.4s, v24.4h, v2.4h\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "smlal2 v6.4s, v20.8h, v2.8h\n"
+ "smlal2 v25.4s, v24.8h, v2.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "ldr d20, [x24, x2]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v2.4h\n"
+ "smlal2 v30.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x5, #0xb0]\n"
+ "smlal v27.4s, v19.4h, v18.4h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v6.4s, v19.8h, v18.8h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "smlal v8.4s, v19.4h, v31.4h\n"
+ "smlal2 v0.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x23, x2]\n"
+ "ssubl v2.8h, v2.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v18.4h\n"
+ "smlal2 v30.4s, v26.8h, v18.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "smlal v27.4s, v12.4h, v31.4h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v6.4s, v12.8h, v31.8h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v12.4h, v21.4h\n"
+ "smlal2 v0.4s, v12.8h, v21.8h\n"
+ "ldr d12, [x22, x2]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d31, [x5, #0xc0]\n"
+ "smlal v27.4s, v7.4h, v21.4h\n"
+ "smlal v1.4s, v11.4h, v21.4h\n"
+ "smlal2 v6.4s, v7.8h, v21.8h\n"
+ "smlal2 v25.4s, v11.8h, v21.8h\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "smlal v8.4s, v7.4h, v4.4h\n"
+ "smlal2 v0.4s, v7.8h, v4.8h\n"
+ "ldr d7, [x21, x2]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v21.4h\n"
+ "smlal2 v30.4s, v28.8h, v21.8h\n"
+ "ldr d21, [x20, x2]\n"
+ "add x2, x2, #0x8\n"
+ "smlal v27.4s, v29.4h, v4.4h\n"
+ "smlal v1.4s, v28.4h, v4.4h\n"
+ "smlal2 v6.4s, v29.8h, v4.8h\n"
+ "ldr q29, [x6, #0x0]\n"
+ "smlal2 v25.4s, v28.8h, v4.8h\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal2 v0.4s, v24.8h, v3.8h\n"
+ "ldr q24, [x7, #0x0]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "smlal v5.4s, v16.4h, v4.4h\n"
+ "smlal2 v30.4s, v16.8h, v4.8h\n"
+ "ldr q4, [x6, #0x10]\n"
+ "add x6, x6, #0x20\n"
+ "smlal v27.4s, v17.4h, v3.4h\n"
+ "smlal v1.4s, v22.4h, v3.4h\n"
+ "smlal2 v6.4s, v17.8h, v3.8h\n"
+ "smlal2 v25.4s, v22.8h, v3.8h\n"
+ "ldr q22, [x7, #0x10]\n"
"add x7, x7, #0x20\n"
- "smlal v24.4s, v2.4h, v1.4h\n"
- "smlal v23.4s, v16.4h, v1.4h\n"
- "smlal2 v15.4s, v14.8h, v30.8h\n"
- "smlal v7.4s, v25.4h, v31.4h\n"
- "smlal2 v5.4s, v14.8h, v1.8h\n"
- "ldr q14, [x8, #0x10]\n"
- "smlal2 v22.4s, v2.8h, v1.8h\n"
- "add x8, x8, #0x20\n"
- "smlal2 v19.4s, v16.8h, v1.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal v24.4s, v16.4h, v30.4h\n"
- "smlal v23.4s, v4.4h, v30.4h\n"
- "smlal2 v15.4s, v25.8h, v31.8h\n"
- "smlal v7.4s, v10.4h, v29.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal2 v22.4s, v16.8h, v30.8h\n"
- "smlal2 v19.4s, v4.8h, v30.8h\n"
- "smlal v20.4s, v10.4h, v31.4h\n"
- "smlal v24.4s, v4.4h, v31.4h\n"
- "smlal v23.4s, v17.4h, v31.4h\n"
- "smlal2 v15.4s, v10.8h, v29.8h\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "sqrdmulh v7.4s, v7.4s, v8.4s\n"
- "smlal2 v5.4s, v10.8h, v31.8h\n"
- "smlal2 v22.4s, v4.8h, v31.8h\n"
- "and v4.16b, v7.16b, v28.16b\n"
- "smlal2 v19.4s, v17.8h, v31.8h\n"
- "smlal v20.4s, v9.4h, v29.4h\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "smlal v24.4s, v17.4h, v29.4h\n"
- "smlal v23.4s, v6.4h, v29.4h\n"
- "sqadd v7.4s, v7.4s, v4.4s\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal2 v5.4s, v9.8h, v29.8h\n"
- "sqrdmulh v15.4s, v15.4s, v27.4s\n"
- "smlal2 v22.4s, v17.8h, v29.8h\n"
- "smlal2 v19.4s, v6.8h, v29.8h\n"
- "and v30.16b, v15.16b, v14.16b\n"
- "smlal v20.4s, v12.4h, v21.4h\n"
- "smlal v24.4s, v6.4h, v21.4h\n"
- "sqrdmulh v20.4s, v20.4s, v8.4s\n"
- "smlal v23.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v12.8h, v21.8h\n"
- "sqrdmulh v24.4s, v24.4s, v8.4s\n"
- "smlal2 v22.4s, v6.8h, v21.8h\n"
- "smlal2 v19.4s, v3.8h, v21.8h\n"
- "sqrdmulh v23.4s, v23.4s, v8.4s\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "and v3.16b, v20.16b, v28.16b\n"
- "sqrdmulh v5.4s, v5.4s, v27.4s\n"
- "and v25.16b, v24.16b, v28.16b\n"
- "sqrdmulh v22.4s, v22.4s, v27.4s\n"
- "and v16.16b, v23.16b, v28.16b\n"
- "sqrdmulh v19.4s, v19.4s, v27.4s\n"
- "sqadd v15.4s, v15.4s, v30.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "and v4.16b, v5.16b, v14.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "and v10.16b, v22.16b, v14.16b\n"
+ "smlal v8.4s, v17.4h, v23.4h\n"
+ "smlal2 v0.4s, v17.8h, v23.8h\n"
+ "smlal v5.4s, v20.4h, v3.4h\n"
+ "smlal2 v30.4s, v20.8h, v3.8h\n"
+ "smlal v27.4s, v26.4h, v23.4h\n"
+ "smlal v1.4s, v20.4h, v23.4h\n"
+ "smlal2 v6.4s, v26.8h, v23.8h\n"
+ "smlal2 v25.4s, v20.8h, v23.8h\n"
+ "smlal v8.4s, v26.4h, v2.4h\n"
+ "smlal2 v0.4s, v26.8h, v2.8h\n"
+ "smlal v5.4s, v19.4h, v23.4h\n"
+ "smlal2 v30.4s, v19.8h, v23.8h\n"
+ "smlal v27.4s, v11.4h, v2.4h\n"
+ "smlal v1.4s, v19.4h, v2.4h\n"
+ "smlal2 v6.4s, v11.8h, v2.8h\n"
+ "smlal2 v25.4s, v19.8h, v2.8h\n"
+ "smlal v8.4s, v11.4h, v18.4h\n"
+ "smlal2 v0.4s, v11.8h, v18.8h\n"
+ "smlal v5.4s, v12.4h, v2.4h\n"
+ "smlal2 v30.4s, v12.8h, v2.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal v1.4s, v12.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal2 v25.4s, v12.8h, v18.8h\n"
+ "smlal v8.4s, v28.4h, v31.4h\n"
+ "smlal2 v0.4s, v28.8h, v31.8h\n"
+ "smlal v5.4s, v7.4h, v18.4h\n"
+ "smlal2 v30.4s, v7.8h, v18.8h\n"
+ "smlal v27.4s, v16.4h, v31.4h\n"
+ "smlal v1.4s, v7.4h, v31.4h\n"
+ "smlal2 v6.4s, v16.8h, v31.8h\n"
+ "smlal2 v25.4s, v7.8h, v31.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v29.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v4.4s\n"
+ "smlal v5.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "and v17.16b, v8.16b, v24.16b\n"
+ "sqrdmulh v27.4s, v27.4s, v29.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v28.16b, v0.16b, v22.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v16.16b, v27.16b, v24.16b\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v11.16b, v5.16b, v24.16b\n"
+ "sqadd v0.4s, v0.4s, v28.4s\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "and v12.16b, v19.16b, v14.16b\n"
- "sqadd v20.4s, v20.4s, v3.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v25.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v16.4s\n"
+ "and v18.16b, v6.16b, v22.16b\n"
"sshr v12.4s, v12.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v28.4s\n"
- "srshl v20.4s, v20.4s, v28.4s\n"
- "sqadd v5.4s, v5.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v28.4s\n"
- "sqadd v22.4s, v22.4s, v10.4s\n"
- "srshl v23.4s, v23.4s, v28.4s\n"
- "sqadd v19.4s, v19.4s, v12.4s\n"
- "srshl v15.4s, v15.4s, v14.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v14.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v14.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x17, x4]\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d20, [x16, x4]\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d24, [x15, x4]\n"
- "str d23, [x14, x4]\n"
- "add x4, x4, #0x8\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v19.16b, v30.16b, v22.16b\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v5.4s, v5.4s, v11.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v24.4s\n"
+ "srshl v27.4s, v27.4s, v24.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v24.4s\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d8, [x8, x3]\n"
+ "str d27, [x17, x3]\n"
+ "str d1, [x16, x3]\n"
+ "str d5, [x15, x3]\n"
+ "add x3, x3, #0x8\n"
"beq 124f\n"
- "add x6, x6, #0xc8\n"
+ "add x5, x5, #0xc8\n"
"3:" // Oddments
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x1, #2, 5f\n"
- "ld1 { v7.4s }, [x20], #0x10\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
"tbz x1, #1, 4f\n"
- "ld1 { v15.d }[0], [x20], #0x8\n"
+ "ld1 { v0.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v15.s }[2], [x20]\n"
+ "ld1 { v0.s }[2], [x20]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v15.s }[0], [x20]\n"
+ "ld1 { v0.s }[0], [x20]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x1, #1, 6f\n"
- "ld1 { v7.d }[0], [x20], #0x8\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
"tbz x1, #0, 7f\n"
- "ld1 { v7.s }[2], [x20]\n"
+ "ld1 { v8.s }[2], [x20]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 7f\n"
- "ld1 { v7.s }[0], [x20]\n"
+ "ld1 { v8.s }[0], [x20]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "ldr d6, [x6, #0x0]\n"
- "ldr d14, [x6, #0x8]\n"
- "mov v20.16b, v7.16b\n"
- "mov v5.16b, v15.16b\n"
- "ldr d10, [x6, #0x10]\n"
- "ldr d21, [x6, #0x18]\n"
- "mov v24.16b, v7.16b\n"
- "mov v22.16b, v15.16b\n"
- "ldr d12, [x6, #0x20]\n"
- "ldp x9, x28, [x5, #0x0]\n"
- "mov v23.16b, v7.16b\n"
- "mov v19.16b, v15.16b\n"
- "ldp x27, x26, [x5, #0x10]\n"
- "ldp x25, x24, [x5, #0x20]\n"
- "ssubl v6.8h, v6.8b, v13.8b\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ldp x23, x22, [x5, #0x30]\n"
- "ldp x21, x20, [x5, #0x40]\n"
- "ssubl v10.8h, v10.8b, v13.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "add x9, x9, x3\n"
- "add x28, x28, x3\n"
- "add x27, x27, x3\n"
- "add x26, x26, x3\n"
- "add x25, x25, x3\n"
- "add x24, x24, x3\n"
- "add x23, x23, x3\n"
- "add x22, x22, x3\n"
- "add x21, x21, x3\n"
- "add x20, x20, x3\n"
+ "ldr d31, [x5, #0x0]\n"
+ "ldr d12, [x5, #0x8]\n"
+ "mov v27.16b, v8.16b\n"
+ "mov v6.16b, v0.16b\n"
+ "ldr d17, [x5, #0x10]\n"
+ "ldr d11, [x5, #0x18]\n"
+ "mov v1.16b, v8.16b\n"
+ "mov v25.16b, v0.16b\n"
+ "ldr d23, [x5, #0x20]\n"
+ "ldp x9, x28, [x4, #0x0]\n"
+ "mov v5.16b, v8.16b\n"
+ "mov v30.16b, v0.16b\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "ldp x27, x26, [x4, #0x10]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ldp x25, x24, [x4, #0x20]\n"
+ "add x9, x9, x2\n"
+ "add x28, x28, x2\n"
+ "add x27, x27, x2\n"
+ "add x26, x26, x2\n"
+ "ldp x23, x22, [x4, #0x30]\n"
+ "add x25, x25, x2\n"
+ "add x24, x24, x2\n"
+ "ldp x21, x20, [x4, #0x40]\n"
+ "add x23, x23, x2\n"
+ "add x22, x22, x2\n"
+ "add x21, x21, x2\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 9f\n"
- "ld1 { v31.s }[0], [x9], #0x4\n"
- "ld1 { v17.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
- "ld1 { v16.s }[0], [x26], #0x4\n"
- "ld1 { v3.s }[0], [x25], #0x4\n"
- "ld1 { v4.s }[0], [x24], #0x4\n"
- "ld1 { v25.s }[0], [x23], #0x4\n"
- "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x9], #0x4\n"
+ "ld1 { v21.s }[0], [x28], #0x4\n"
+ "ld1 { v16.s }[0], [x27], #0x4\n"
+ "ld1 { v20.s }[0], [x26], #0x4\n"
+ "ld1 { v7.s }[0], [x25], #0x4\n"
+ "ld1 { v19.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v26.s }[0], [x22], #0x4\n"
"ld1 { v29.s }[0], [x21], #0x4\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
"tbz x1, #1, 8f\n"
- "ld1 { v31.h }[2], [x9], #0x2\n"
- "ld1 { v17.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v16.h }[2], [x26], #0x2\n"
- "ld1 { v3.h }[2], [x25], #0x2\n"
- "ld1 { v4.h }[2], [x24], #0x2\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v21.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v20.h }[2], [x26], #0x2\n"
+ "ld1 { v7.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
"ld1 { v29.h }[2], [x21], #0x2\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[6], [x9]\n"
- "ld1 { v17.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
- "ld1 { v16.b }[6], [x26]\n"
- "ld1 { v3.b }[6], [x25]\n"
- "ld1 { v4.b }[6], [x24]\n"
- "ld1 { v25.b }[6], [x23]\n"
- "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x9]\n"
+ "ld1 { v21.b }[6], [x28]\n"
+ "ld1 { v16.b }[6], [x27]\n"
+ "ld1 { v20.b }[6], [x26]\n"
+ "ld1 { v7.b }[6], [x25]\n"
+ "ld1 { v19.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
"ld1 { v29.b }[6], [x21]\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[4], [x9]\n"
- "ld1 { v17.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v16.b }[4], [x26]\n"
- "ld1 { v3.b }[4], [x25]\n"
- "ld1 { v4.b }[4], [x24]\n"
- "ld1 { v25.b }[4], [x23]\n"
- "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x9]\n"
+ "ld1 { v21.b }[4], [x28]\n"
+ "ld1 { v16.b }[4], [x27]\n"
+ "ld1 { v20.b }[4], [x26]\n"
+ "ld1 { v7.b }[4], [x25]\n"
+ "ld1 { v19.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
"ld1 { v29.b }[4], [x21]\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
"tbz x1, #1, 10f\n"
- "ld1 { v31.h }[0], [x9], #0x2\n"
- "ld1 { v17.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
- "ld1 { v16.h }[0], [x26], #0x2\n"
- "ld1 { v3.h }[0], [x25], #0x2\n"
- "ld1 { v4.h }[0], [x24], #0x2\n"
- "ld1 { v25.h }[0], [x23], #0x2\n"
- "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x9], #0x2\n"
+ "ld1 { v21.h }[0], [x28], #0x2\n"
+ "ld1 { v16.h }[0], [x27], #0x2\n"
+ "ld1 { v20.h }[0], [x26], #0x2\n"
+ "ld1 { v7.h }[0], [x25], #0x2\n"
+ "ld1 { v19.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v26.h }[0], [x22], #0x2\n"
"ld1 { v29.h }[0], [x21], #0x2\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[2], [x9]\n"
- "ld1 { v17.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
- "ld1 { v16.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x25]\n"
- "ld1 { v4.b }[2], [x24]\n"
- "ld1 { v25.b }[2], [x23]\n"
- "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x9]\n"
+ "ld1 { v21.b }[2], [x28]\n"
+ "ld1 { v16.b }[2], [x27]\n"
+ "ld1 { v20.b }[2], [x26]\n"
+ "ld1 { v7.b }[2], [x25]\n"
+ "ld1 { v19.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
"ld1 { v29.b }[2], [x21]\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 11f\n"
- "ld1 { v31.b }[0], [x9]\n"
- "ld1 { v17.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
- "ld1 { v16.b }[0], [x26]\n"
- "ld1 { v3.b }[0], [x25]\n"
- "ld1 { v4.b }[0], [x24]\n"
- "ld1 { v25.b }[0], [x23]\n"
- "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x9]\n"
+ "ld1 { v21.b }[0], [x28]\n"
+ "ld1 { v16.b }[0], [x27]\n"
+ "ld1 { v20.b }[0], [x26]\n"
+ "ld1 { v7.b }[0], [x25]\n"
+ "ld1 { v19.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v26.b }[0], [x22]\n"
"ld1 { v29.b }[0], [x21]\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v18.8b\n"
- "usubl v17.8h, v17.8b, v18.8b\n"
- "smlal v7.4s, v31.4h, v6.4h\n"
- "ldr x20, [x5, #0x50]\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "smlal2 v15.4s, v31.8h, v6.8h\n"
- "smlal v20.4s, v17.4h, v6.4h\n"
- "smlal2 v5.4s, v17.8h, v6.8h\n"
- "smlal v24.4s, v30.4h, v6.4h\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "add x20, x20, x3\n"
- "smlal2 v22.4s, v30.8h, v6.8h\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "smlal v23.4s, v16.4h, v6.4h\n"
- "smlal2 v19.4s, v16.8h, v6.8h\n"
- "smlal v7.4s, v17.4h, v14.4h\n"
- "usubl v4.8h, v4.8b, v18.8b\n"
- "smlal2 v15.4s, v17.8h, v14.8h\n"
- "smlal v20.4s, v3.4h, v14.4h\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "smlal2 v5.4s, v3.8h, v14.8h\n"
- "smlal v24.4s, v16.4h, v14.4h\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal2 v22.4s, v16.8h, v14.8h\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "smlal v23.4s, v4.4h, v14.4h\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "smlal2 v19.4s, v4.8h, v14.8h\n"
- "smlal v7.4s, v3.4h, v10.4h\n"
- "smlal2 v15.4s, v3.8h, v10.8h\n"
- "smlal v20.4s, v25.4h, v10.4h\n"
- "smlal2 v5.4s, v25.8h, v10.8h\n"
- "smlal v24.4s, v4.4h, v10.4h\n"
- "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0x50]\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "smlal v8.4s, v24.4h, v31.4h\n"
+ "smlal2 v0.4s, v24.8h, v31.8h\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "add x20, x20, x2\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal v1.4s, v16.4h, v31.4h\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "smlal2 v25.4s, v16.8h, v31.8h\n"
+ "smlal v5.4s, v20.4h, v31.4h\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "smlal2 v30.4s, v20.8h, v31.8h\n"
+ "smlal v8.4s, v21.4h, v12.4h\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "smlal2 v0.4s, v21.8h, v12.8h\n"
+ "smlal v27.4s, v7.4h, v12.4h\n"
+ "smlal2 v6.4s, v7.8h, v12.8h\n"
+ "smlal v1.4s, v20.4h, v12.4h\n"
+ "smlal2 v25.4s, v20.8h, v12.8h\n"
+ "smlal v5.4s, v19.4h, v12.4h\n"
+ "smlal2 v30.4s, v19.8h, v12.8h\n"
+ "smlal v8.4s, v7.4h, v17.4h\n"
+ "smlal2 v0.4s, v7.8h, v17.8h\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 13f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
"tbz x1, #1, 12f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v4.b }[6], [x20]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v4.b }[4], [x20]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
"tbz x1, #1, 14f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v4.b }[2], [x20]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 15f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v4.b }[0], [x20]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v18.8b\n"
- "ldr x20, [x5, #0x58]\n"
- "smlal v23.4s, v27.4h, v10.4h\n"
- "smlal2 v19.4s, v27.8h, v10.8h\n"
- "smlal v7.4s, v25.4h, v21.4h\n"
- "smlal2 v15.4s, v25.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v20.4s, v9.4h, v21.4h\n"
- "smlal2 v5.4s, v9.8h, v21.8h\n"
- "smlal v24.4s, v27.4h, v21.4h\n"
- "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x20, [x4, #0x58]\n"
+ "smlal v8.4s, v28.4h, v11.4h\n"
+ "smlal2 v0.4s, v28.8h, v11.8h\n"
+ "smlal v27.4s, v26.4h, v11.4h\n"
+ "smlal2 v6.4s, v26.8h, v11.8h\n"
+ "smlal v5.4s, v4.4h, v17.4h\n"
+ "smlal2 v30.4s, v4.8h, v17.8h\n"
+ "smlal v1.4s, v4.4h, v11.4h\n"
+ "smlal2 v25.4s, v4.8h, v11.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 17f\n"
- "ld1 { v6.s }[0], [x20], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 16f\n"
- "ld1 { v6.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
"tbz x1, #1, 18f\n"
- "ld1 { v6.h }[0], [x20], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 19f\n"
- "ld1 { v6.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v6.8h, v6.8b, v18.8b\n"
- "ldr x20, [x5, #0x60]\n"
- "smlal v23.4s, v6.4h, v21.4h\n"
- "smlal2 v19.4s, v6.8h, v21.8h\n"
- "smlal v7.4s, v9.4h, v12.4h\n"
- "smlal2 v15.4s, v9.8h, v12.8h\n"
- "add x20, x20, x3\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0x60]\n"
+ "smlal v8.4s, v26.4h, v23.4h\n"
+ "smlal2 v0.4s, v26.8h, v23.8h\n"
+ "smlal v5.4s, v21.4h, v11.4h\n"
+ "smlal2 v30.4s, v21.8h, v11.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 21f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 20f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
"tbz x1, #1, 22f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 23f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "ldr d14, [x6, #0x28]\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal v20.4s, v9.4h, v12.4h\n"
- "smlal2 v5.4s, v9.8h, v12.8h\n"
- "smlal v24.4s, v6.4h, v12.4h\n"
- "smlal2 v22.4s, v6.8h, v12.8h\n"
- "ssubl v14.8h, v14.8b, v13.8b\n"
- "ldr x20, [x5, #0x68]\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v30.4h, v14.4h\n"
- "smlal2 v15.4s, v30.8h, v14.8h\n"
- "smlal v20.4s, v16.4h, v14.4h\n"
- "smlal2 v5.4s, v16.8h, v14.8h\n"
- "smlal v24.4s, v28.4h, v14.4h\n"
- "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "ldr d11, [x5, #0x28]\n"
+ "usubl v31.8h, v31.8b, v15.8b\n"
+ "smlal v1.4s, v21.4h, v23.4h\n"
+ "smlal2 v25.4s, v21.8h, v23.8h\n"
+ "ldr x20, [x4, #0x68]\n"
+ "smlal v5.4s, v29.4h, v23.4h\n"
+ "smlal2 v30.4s, v29.8h, v23.8h\n"
+ "smlal v27.4s, v31.4h, v23.4h\n"
+ "smlal2 v6.4s, v31.8h, v23.8h\n"
+ "ssubl v11.8h, v11.8b, v9.8b\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "smlal v1.4s, v18.4h, v11.4h\n"
+ "smlal2 v25.4s, v18.8h, v11.8h\n"
+ "smlal v27.4s, v20.4h, v11.4h\n"
+ "smlal2 v6.4s, v20.8h, v11.8h\n"
"tbz x1, #2, 25f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
"tbz x1, #1, 24f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
"tbz x1, #1, 26f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 27f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x20]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d21, [x6, #0x30]\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0x70]\n"
- "smlal v23.4s, v25.4h, v14.4h\n"
- "smlal2 v19.4s, v25.8h, v14.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v16.4h, v21.4h\n"
- "smlal2 v15.4s, v16.8h, v21.8h\n"
- "smlal v20.4s, v4.4h, v21.4h\n"
- "smlal2 v5.4s, v4.8h, v21.8h\n"
- "smlal v24.4s, v25.4h, v21.4h\n"
- "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ldr d3, [x5, #0x30]\n"
+ "usubl v24.8h, v24.8b, v15.8b\n"
+ "ldr x20, [x4, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v24.4h, v11.4h\n"
+ "smlal2 v30.4s, v24.8h, v11.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v3.4h\n"
+ "smlal2 v0.4s, v20.8h, v3.8h\n"
+ "smlal v27.4s, v19.4h, v3.4h\n"
+ "smlal2 v6.4s, v19.8h, v3.8h\n"
+ "smlal v1.4s, v24.4h, v3.4h\n"
+ "smlal2 v25.4s, v24.8h, v3.8h\n"
"tbz x1, #2, 29f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
"tbz x1, #1, 28f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ld1 { v2.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "ld1 { v2.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
"tbz x1, #1, 30f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "ld1 { v2.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 31f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "ld1 { v2.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d9, [x6, #0x38]\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "ssubl v9.8h, v9.8b, v13.8b\n"
- "ldr x20, [x5, #0x78]\n"
- "smlal v23.4s, v10.4h, v21.4h\n"
- "smlal2 v19.4s, v10.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v4.4h, v9.4h\n"
- "smlal2 v15.4s, v4.8h, v9.8h\n"
- "smlal v20.4s, v27.4h, v9.4h\n"
- "smlal2 v5.4s, v27.8h, v9.8h\n"
- "smlal v24.4s, v10.4h, v9.4h\n"
- "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "ldr d22, [x5, #0x38]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x20, [x4, #0x78]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v2.4h, v3.4h\n"
+ "smlal2 v30.4s, v2.8h, v3.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v19.4h, v22.4h\n"
+ "smlal2 v0.4s, v19.8h, v22.8h\n"
+ "smlal v27.4s, v4.4h, v22.4h\n"
+ "smlal2 v6.4s, v4.8h, v22.8h\n"
+ "smlal v1.4s, v2.4h, v22.4h\n"
+ "smlal2 v25.4s, v2.8h, v22.8h\n"
"tbz x1, #2, 33f\n"
- "ld1 { v12.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"tbz x1, #1, 32f\n"
- "ld1 { v12.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
"tbz x1, #1, 34f\n"
- "ld1 { v12.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 35f\n"
- "ld1 { v12.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d31, [x6, #0x40]\n"
- "usubl v12.8h, v12.8b, v18.8b\n"
- "ssubl v31.8h, v31.8b, v13.8b\n"
- "ldr x20, [x5, #0x80]\n"
- "smlal v23.4s, v12.4h, v9.4h\n"
- "smlal2 v19.4s, v12.8h, v9.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v27.4h, v31.4h\n"
- "smlal2 v15.4s, v27.8h, v31.8h\n"
- "smlal v20.4s, v6.4h, v31.4h\n"
- "smlal2 v5.4s, v6.8h, v31.8h\n"
- "smlal v24.4s, v12.4h, v31.4h\n"
- "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "ldr d31, [x5, #0x40]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr x20, [x4, #0x80]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v22.4h\n"
+ "smlal2 v30.4s, v26.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v4.4h, v31.4h\n"
+ "smlal2 v0.4s, v4.8h, v31.8h\n"
+ "smlal v27.4s, v21.4h, v31.4h\n"
+ "smlal2 v6.4s, v21.8h, v31.8h\n"
+ "smlal v1.4s, v26.4h, v31.4h\n"
+ "smlal2 v25.4s, v26.8h, v31.8h\n"
"tbz x1, #2, 37f\n"
- "ld1 { v8.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 36f\n"
- "ld1 { v8.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
"tbz x1, #1, 38f\n"
- "ld1 { v8.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 39f\n"
- "ld1 { v8.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d16, [x6, #0x48]\n"
- "usubl v8.8h, v8.8b, v18.8b\n"
- "ssubl v16.8h, v16.8b, v13.8b\n"
- "ldr x20, [x5, #0x88]\n"
- "smlal v23.4s, v8.4h, v31.4h\n"
- "smlal2 v19.4s, v8.8h, v31.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v6.4h, v16.4h\n"
- "smlal2 v15.4s, v6.8h, v16.8h\n"
- "smlal v20.4s, v29.4h, v16.4h\n"
- "smlal2 v5.4s, v29.8h, v16.8h\n"
- "smlal v24.4s, v8.4h, v16.4h\n"
- "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "ldr d17, [x5, #0x48]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x20, [x4, #0x88]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v31.4h\n"
+ "smlal2 v30.4s, v28.8h, v31.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v21.4h, v17.4h\n"
+ "smlal2 v0.4s, v21.8h, v17.8h\n"
+ "smlal v27.4s, v29.4h, v17.4h\n"
+ "smlal2 v6.4s, v29.8h, v17.8h\n"
+ "smlal v1.4s, v28.4h, v17.4h\n"
+ "smlal2 v25.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 41f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v7.s }[0], [x20], #0x4\n"
"tbz x1, #1, 40f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v7.h }[2], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v7.b }[6], [x20]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v7.b }[4], [x20]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
"tbz x1, #1, 42f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v7.h }[0], [x20], #0x2\n"
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v7.b }[2], [x20]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 43f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v7.b }[0], [x20]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d21, [x6, #0x50]\n"
- "usubl v27.8h, v27.8b, v18.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0x90]\n"
- "smlal v23.4s, v27.4h, v16.4h\n"
- "smlal2 v19.4s, v27.8h, v16.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v28.4h, v21.4h\n"
- "smlal2 v15.4s, v28.8h, v21.8h\n"
- "smlal v20.4s, v25.4h, v21.4h\n"
- "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "ldr d22, [x5, #0x50]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr x20, [x4, #0x90]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v7.4h, v17.4h\n"
+ "smlal2 v30.4s, v7.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v18.4h, v22.4h\n"
+ "smlal2 v0.4s, v18.8h, v22.8h\n"
+ "smlal v27.4s, v24.4h, v22.4h\n"
+ "smlal2 v6.4s, v24.8h, v22.8h\n"
"tbz x1, #2, 45f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
"tbz x1, #1, 44f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
"tbz x1, #1, 46f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 47f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v18.8b\n"
- "ldr x20, [x5, #0x98]\n"
- "smlal v24.4s, v31.4h, v21.4h\n"
- "smlal2 v22.4s, v31.8h, v21.8h\n"
- "add x20, x20, x3\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x20, [x4, #0x98]\n"
+ "smlal v1.4s, v20.4h, v22.4h\n"
+ "smlal2 v25.4s, v20.8h, v22.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 49f\n"
- "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
"tbz x1, #1, 48f\n"
- "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
"tbz x1, #1, 50f\n"
- "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 51f\n"
- "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d2, [x6, #0x58]\n"
- "usubl v28.8h, v28.8b, v18.8b\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr x20, [x5, #0xa0]\n"
- "smlal v23.4s, v28.4h, v21.4h\n"
- "smlal2 v19.4s, v28.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v25.4h, v2.4h\n"
- "smlal2 v15.4s, v25.8h, v2.8h\n"
- "smlal v20.4s, v10.4h, v2.4h\n"
- "smlal2 v5.4s, v10.8h, v2.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ldr d17, [x5, #0x58]\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "ldr x20, [x4, #0xa0]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v22.4h\n"
+ "smlal2 v30.4s, v19.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v24.4h, v17.4h\n"
+ "smlal2 v0.4s, v24.8h, v17.8h\n"
+ "smlal v27.4s, v2.4h, v17.4h\n"
+ "smlal2 v6.4s, v2.8h, v17.8h\n"
+ "smlal v1.4s, v19.4h, v17.4h\n"
+ "smlal2 v25.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 53f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x1, #1, 52f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v29.b }[6], [x20]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v29.b }[4], [x20]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
"tbz x1, #1, 54f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v29.b }[2], [x20]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 55f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v29.b }[0], [x20]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d25, [x6, #0x60]\n"
- "usubl v21.8h, v21.8b, v18.8b\n"
- "ssubl v25.8h, v25.8b, v13.8b\n"
- "ldr x20, [x5, #0xa8]\n"
- "smlal v23.4s, v21.4h, v2.4h\n"
- "smlal2 v19.4s, v21.8h, v2.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v10.4h, v25.4h\n"
- "smlal2 v15.4s, v10.8h, v25.8h\n"
- "smlal v20.4s, v12.4h, v25.4h\n"
- "smlal2 v5.4s, v12.8h, v25.8h\n"
- "smlal v24.4s, v21.4h, v25.4h\n"
- "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "ldr d24, [x5, #0x60]\n"
+ "usubl v29.8h, v29.8b, v15.8b\n"
+ "ldr x20, [x4, #0xa8]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v5.4s, v29.4h, v17.4h\n"
+ "smlal2 v30.4s, v29.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v2.4h, v24.4h\n"
+ "smlal2 v0.4s, v2.8h, v24.8h\n"
+ "smlal v27.4s, v26.4h, v24.4h\n"
+ "smlal2 v6.4s, v26.8h, v24.8h\n"
+ "smlal v1.4s, v29.4h, v24.4h\n"
+ "smlal2 v25.4s, v29.8h, v24.8h\n"
"tbz x1, #2, 57f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
"tbz x1, #1, 56f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x20]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x20]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
"tbz x1, #1, 58f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x20]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 59f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v31.b }[0], [x20]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d1, [x6, #0x68]\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr x20, [x5, #0xb0]\n"
- "smlal v23.4s, v9.4h, v25.4h\n"
- "smlal2 v19.4s, v9.8h, v25.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v12.4h, v1.4h\n"
- "smlal2 v15.4s, v12.8h, v1.8h\n"
- "smlal v20.4s, v8.4h, v1.4h\n"
- "smlal2 v5.4s, v8.8h, v1.8h\n"
- "smlal v24.4s, v9.4h, v1.4h\n"
- "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "ldr d17, [x5, #0x68]\n"
+ "usubl v31.8h, v31.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb0]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v31.4h, v24.4h\n"
+ "smlal2 v30.4s, v31.8h, v24.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v26.4h, v17.4h\n"
+ "smlal2 v0.4s, v26.8h, v17.8h\n"
+ "smlal v27.4s, v28.4h, v17.4h\n"
+ "smlal2 v6.4s, v28.8h, v17.8h\n"
+ "smlal v1.4s, v31.4h, v17.4h\n"
+ "smlal2 v25.4s, v31.8h, v17.8h\n"
"tbz x1, #2, 61f\n"
- "ld1 { v3.s }[0], [x20], #0x4\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
"tbz x1, #1, 60f\n"
- "ld1 { v3.h }[2], [x20], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[6], [x20]\n"
+ "ld1 { v21.b }[6], [x20]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[4], [x20]\n"
+ "ld1 { v21.b }[4], [x20]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
"tbz x1, #1, 62f\n"
- "ld1 { v3.h }[0], [x20], #0x2\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[2], [x20]\n"
+ "ld1 { v21.b }[2], [x20]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 63f\n"
- "ld1 { v3.b }[0], [x20]\n"
+ "ld1 { v21.b }[0], [x20]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d16, [x6, #0x70]\n"
- "usubl v3.8h, v3.8b, v18.8b\n"
- "ssubl v16.8h, v16.8b, v13.8b\n"
- "ldr x20, [x5, #0xb8]\n"
- "smlal v23.4s, v3.4h, v1.4h\n"
- "smlal2 v19.4s, v3.8h, v1.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v8.4h, v16.4h\n"
- "smlal2 v15.4s, v8.8h, v16.8h\n"
- "smlal v20.4s, v27.4h, v16.4h\n"
- "smlal2 v5.4s, v27.8h, v16.8h\n"
- "smlal v24.4s, v3.4h, v16.4h\n"
- "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "ldr d22, [x5, #0x70]\n"
+ "usubl v21.8h, v21.8b, v15.8b\n"
+ "ldr x20, [x4, #0xb8]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v21.4h, v17.4h\n"
+ "smlal2 v30.4s, v21.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v28.4h, v22.4h\n"
+ "smlal2 v0.4s, v28.8h, v22.8h\n"
+ "smlal v27.4s, v7.4h, v22.4h\n"
+ "smlal2 v6.4s, v7.8h, v22.8h\n"
+ "smlal v1.4s, v21.4h, v22.4h\n"
+ "smlal2 v25.4s, v21.8h, v22.8h\n"
"tbz x1, #2, 65f\n"
- "ld1 { v14.s }[0], [x20], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
"tbz x1, #1, 64f\n"
- "ld1 { v14.h }[2], [x20], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[6], [x20]\n"
+ "ld1 { v11.b }[6], [x20]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[4], [x20]\n"
+ "ld1 { v11.b }[4], [x20]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
"tbz x1, #1, 66f\n"
- "ld1 { v14.h }[0], [x20], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[2], [x20]\n"
+ "ld1 { v11.b }[2], [x20]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 67f\n"
- "ld1 { v14.b }[0], [x20]\n"
+ "ld1 { v11.b }[0], [x20]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d17, [x6, #0x78]\n"
- "usubl v14.8h, v14.8b, v18.8b\n"
- "ssubl v17.8h, v17.8b, v13.8b\n"
- "ldr x20, [x5, #0xc0]\n"
- "smlal v23.4s, v14.4h, v16.4h\n"
- "smlal2 v19.4s, v14.8h, v16.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v31.4h, v17.4h\n"
- "smlal2 v15.4s, v31.8h, v17.8h\n"
- "smlal v20.4s, v28.4h, v17.4h\n"
- "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "ldr d17, [x5, #0x78]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr x20, [x4, #0xc0]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v11.4h, v22.4h\n"
+ "smlal2 v30.4s, v11.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v17.4h\n"
+ "smlal2 v0.4s, v20.8h, v17.8h\n"
+ "smlal v27.4s, v19.4h, v17.4h\n"
+ "smlal2 v6.4s, v19.8h, v17.8h\n"
"tbz x1, #2, 69f\n"
- "ld1 { v1.s }[0], [x20], #0x4\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
"tbz x1, #1, 68f\n"
- "ld1 { v1.h }[2], [x20], #0x2\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[6], [x20]\n"
+ "ld1 { v18.b }[6], [x20]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[4], [x20]\n"
+ "ld1 { v18.b }[4], [x20]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
"tbz x1, #1, 70f\n"
- "ld1 { v1.h }[0], [x20], #0x2\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x20]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 71f\n"
- "ld1 { v1.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x20]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v1.8h, v1.8b, v18.8b\n"
- "ldr x20, [x5, #0xc8]\n"
- "smlal v24.4s, v1.4h, v17.4h\n"
- "smlal2 v22.4s, v1.8h, v17.8h\n"
- "add x20, x20, x3\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr x20, [x4, #0xc8]\n"
+ "smlal v1.4s, v18.4h, v17.4h\n"
+ "smlal2 v25.4s, v18.8h, v17.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 73f\n"
- "ld1 { v16.s }[0], [x20], #0x4\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
"tbz x1, #1, 72f\n"
- "ld1 { v16.h }[2], [x20], #0x2\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[6], [x20]\n"
+ "ld1 { v20.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[4], [x20]\n"
+ "ld1 { v20.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
"tbz x1, #1, 74f\n"
- "ld1 { v16.h }[0], [x20], #0x2\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[2], [x20]\n"
+ "ld1 { v20.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 75f\n"
- "ld1 { v16.b }[0], [x20]\n"
+ "ld1 { v20.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d29, [x6, #0x80]\n"
- "usubl v16.8h, v16.8b, v18.8b\n"
- "ssubl v29.8h, v29.8b, v13.8b\n"
- "ldr x20, [x5, #0xd0]\n"
- "smlal v23.4s, v16.4h, v17.4h\n"
- "smlal2 v19.4s, v16.8h, v17.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v28.4h, v29.4h\n"
- "smlal2 v15.4s, v28.8h, v29.8h\n"
- "smlal v20.4s, v21.4h, v29.4h\n"
- "smlal2 v5.4s, v21.8h, v29.8h\n"
- "smlal v24.4s, v16.4h, v29.4h\n"
- "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "ldr d4, [x5, #0x80]\n"
+ "usubl v20.8h, v20.8b, v15.8b\n"
+ "ldr x20, [x4, #0xd0]\n"
+ "ssubl v4.8h, v4.8b, v9.8b\n"
+ "smlal v5.4s, v20.4h, v17.4h\n"
+ "smlal2 v30.4s, v20.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v19.4h, v4.4h\n"
+ "smlal2 v0.4s, v19.8h, v4.8h\n"
+ "smlal v27.4s, v29.4h, v4.4h\n"
+ "smlal2 v6.4s, v29.8h, v4.8h\n"
+ "smlal v1.4s, v20.4h, v4.4h\n"
+ "smlal2 v25.4s, v20.8h, v4.8h\n"
"tbz x1, #2, 77f\n"
- "ld1 { v30.s }[0], [x20], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
"tbz x1, #1, 76f\n"
- "ld1 { v30.h }[2], [x20], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[6], [x20]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[4], [x20]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
"tbz x1, #1, 78f\n"
- "ld1 { v30.h }[0], [x20], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[2], [x20]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 79f\n"
- "ld1 { v30.b }[0], [x20]\n"
+ "ld1 { v26.b }[0], [x20]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d12, [x6, #0x88]\n"
- "usubl v30.8h, v30.8b, v18.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ldr x20, [x5, #0xd8]\n"
- "smlal v23.4s, v30.4h, v29.4h\n"
- "smlal2 v19.4s, v30.8h, v29.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v21.4h, v12.4h\n"
- "smlal2 v15.4s, v21.8h, v12.8h\n"
- "smlal v20.4s, v9.4h, v12.4h\n"
- "smlal2 v5.4s, v9.8h, v12.8h\n"
- "smlal v24.4s, v30.4h, v12.4h\n"
- "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "ldr d17, [x5, #0x88]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr x20, [x4, #0xd8]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v26.4h, v4.4h\n"
+ "smlal2 v30.4s, v26.8h, v4.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v29.4h, v17.4h\n"
+ "smlal2 v0.4s, v29.8h, v17.8h\n"
+ "smlal v27.4s, v31.4h, v17.4h\n"
+ "smlal2 v6.4s, v31.8h, v17.8h\n"
+ "smlal v1.4s, v26.4h, v17.4h\n"
+ "smlal2 v25.4s, v26.8h, v17.8h\n"
"tbz x1, #2, 81f\n"
- "ld1 { v29.s }[0], [x20], #0x4\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
"tbz x1, #1, 80f\n"
- "ld1 { v29.h }[2], [x20], #0x2\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[6], [x20]\n"
+ "ld1 { v23.b }[6], [x20]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[4], [x20]\n"
+ "ld1 { v23.b }[4], [x20]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
"tbz x1, #1, 82f\n"
- "ld1 { v29.h }[0], [x20], #0x2\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[2], [x20]\n"
+ "ld1 { v23.b }[2], [x20]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 83f\n"
- "ld1 { v29.b }[0], [x20]\n"
+ "ld1 { v23.b }[0], [x20]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d21, [x6, #0x90]\n"
- "usubl v29.8h, v29.8b, v18.8b\n"
- "ssubl v21.8h, v21.8b, v13.8b\n"
- "ldr x20, [x5, #0xe0]\n"
- "smlal v23.4s, v29.4h, v12.4h\n"
- "smlal2 v19.4s, v29.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v9.4h, v21.4h\n"
- "smlal2 v15.4s, v9.8h, v21.8h\n"
- "smlal v20.4s, v3.4h, v21.4h\n"
- "smlal2 v5.4s, v3.8h, v21.8h\n"
- "smlal v24.4s, v29.4h, v21.4h\n"
- "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "ldr d22, [x5, #0x90]\n"
+ "usubl v23.8h, v23.8b, v15.8b\n"
+ "ldr x20, [x4, #0xe0]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal v5.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v31.4h, v22.4h\n"
+ "smlal2 v0.4s, v31.8h, v22.8h\n"
+ "smlal v27.4s, v21.4h, v22.4h\n"
+ "smlal2 v6.4s, v21.8h, v22.8h\n"
+ "smlal v1.4s, v23.4h, v22.4h\n"
+ "smlal2 v25.4s, v23.8h, v22.8h\n"
"tbz x1, #2, 85f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
"tbz x1, #1, 84f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v28.b }[6], [x20]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v28.b }[4], [x20]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
"tbz x1, #1, 86f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v28.b }[2], [x20]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 87f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v28.b }[0], [x20]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d8, [x6, #0x98]\n"
- "usubl v25.8h, v25.8b, v18.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x20, [x5, #0xe8]\n"
- "smlal v23.4s, v25.4h, v21.4h\n"
- "smlal2 v19.4s, v25.8h, v21.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v3.4h, v8.4h\n"
- "smlal2 v15.4s, v3.8h, v8.8h\n"
- "smlal v20.4s, v14.4h, v8.4h\n"
- "smlal2 v5.4s, v14.8h, v8.8h\n"
- "smlal v24.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "ldr d17, [x5, #0x98]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldr x20, [x4, #0xe8]\n"
+ "ssubl v17.8h, v17.8b, v9.8b\n"
+ "smlal v5.4s, v28.4h, v22.4h\n"
+ "smlal2 v30.4s, v28.8h, v22.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v21.4h, v17.4h\n"
+ "smlal2 v0.4s, v21.8h, v17.8h\n"
+ "smlal v27.4s, v11.4h, v17.4h\n"
+ "smlal2 v6.4s, v11.8h, v17.8h\n"
+ "smlal v1.4s, v28.4h, v17.4h\n"
+ "smlal2 v25.4s, v28.8h, v17.8h\n"
"tbz x1, #2, 89f\n"
- "ld1 { v21.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
"tbz x1, #1, 88f\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x20]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x20]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
"tbz x1, #1, 90f\n"
- "ld1 { v21.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x20]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 91f\n"
- "ld1 { v21.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x20]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d9, [x6, #0xa0]\n"
- "usubl v21.8h, v21.8b, v18.8b\n"
- "ssubl v9.8h, v9.8b, v13.8b\n"
- "ldr x20, [x5, #0xf0]\n"
- "smlal v23.4s, v21.4h, v8.4h\n"
- "smlal2 v19.4s, v21.8h, v8.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v1.4h, v9.4h\n"
- "smlal2 v15.4s, v1.8h, v9.8h\n"
- "smlal v20.4s, v16.4h, v9.4h\n"
- "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "ldr d3, [x5, #0xa0]\n"
+ "usubl v16.8h, v16.8b, v15.8b\n"
+ "ldr x20, [x4, #0xf0]\n"
+ "ssubl v3.8h, v3.8b, v9.8b\n"
+ "smlal v5.4s, v16.4h, v17.4h\n"
+ "smlal2 v30.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v18.4h, v3.4h\n"
+ "smlal2 v0.4s, v18.8h, v3.8h\n"
+ "smlal v27.4s, v20.4h, v3.4h\n"
+ "smlal2 v6.4s, v20.8h, v3.8h\n"
"tbz x1, #2, 93f\n"
"ld1 { v12.s }[0], [x20], #0x4\n"
"tbz x1, #1, 92f\n"
@@ -1871,308 +1871,308 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
"tbz x1, #0, 95f\n"
"ld1 { v12.b }[0], [x20]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "usubl v12.8h, v12.8b, v18.8b\n"
- "ldr x20, [x5, #0xf8]\n"
- "smlal v24.4s, v12.4h, v9.4h\n"
- "smlal2 v22.4s, v12.8h, v9.8h\n"
- "add x20, x20, x3\n"
+ "usubl v12.8h, v12.8b, v15.8b\n"
+ "ldr x20, [x4, #0xf8]\n"
+ "smlal v1.4s, v12.4h, v3.4h\n"
+ "smlal2 v25.4s, v12.8h, v3.8h\n"
+ "add x20, x20, x2\n"
"tbz x1, #2, 97f\n"
- "ld1 { v10.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 96f\n"
- "ld1 { v10.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
"tbz x1, #1, 98f\n"
- "ld1 { v10.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 99f\n"
- "ld1 { v10.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d12, [x6, #0xa8]\n"
- "usubl v10.8h, v10.8b, v18.8b\n"
- "ssubl v12.8h, v12.8b, v13.8b\n"
- "ldr x20, [x5, #0x100]\n"
- "smlal v23.4s, v10.4h, v9.4h\n"
- "smlal2 v19.4s, v10.8h, v9.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v16.4h, v12.4h\n"
- "smlal2 v15.4s, v16.8h, v12.8h\n"
- "smlal v20.4s, v30.4h, v12.4h\n"
- "smlal2 v5.4s, v30.8h, v12.8h\n"
- "smlal v24.4s, v10.4h, v12.4h\n"
- "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "ldr d18, [x5, #0xa8]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "ldr x20, [x4, #0x100]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v17.4h, v3.4h\n"
+ "smlal2 v30.4s, v17.8h, v3.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v20.4h, v18.4h\n"
+ "smlal2 v0.4s, v20.8h, v18.8h\n"
+ "smlal v27.4s, v26.4h, v18.4h\n"
+ "smlal2 v6.4s, v26.8h, v18.8h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
"tbz x1, #2, 101f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
"tbz x1, #1, 100f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v19.b }[6], [x20]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v19.b }[4], [x20]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
"tbz x1, #1, 102f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v19.b }[2], [x20]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 103f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v19.b }[0], [x20]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d28, [x6, #0xb0]\n"
- "usubl v9.8h, v9.8b, v18.8b\n"
- "ssubl v28.8h, v28.8b, v13.8b\n"
- "ldr x20, [x5, #0x108]\n"
- "smlal v23.4s, v9.4h, v12.4h\n"
- "smlal2 v19.4s, v9.8h, v12.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v30.4h, v28.4h\n"
- "smlal2 v15.4s, v30.8h, v28.8h\n"
- "smlal v20.4s, v29.4h, v28.4h\n"
- "smlal2 v5.4s, v29.8h, v28.8h\n"
- "smlal v24.4s, v9.4h, v28.4h\n"
- "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "ldr d12, [x5, #0xb0]\n"
+ "usubl v19.8h, v19.8b, v15.8b\n"
+ "ldr x20, [x4, #0x108]\n"
+ "ssubl v12.8h, v12.8b, v9.8b\n"
+ "smlal v5.4s, v19.4h, v18.4h\n"
+ "smlal2 v30.4s, v19.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v26.4h, v12.4h\n"
+ "smlal2 v0.4s, v26.8h, v12.8h\n"
+ "smlal v27.4s, v23.4h, v12.4h\n"
+ "smlal2 v6.4s, v23.8h, v12.8h\n"
+ "smlal v1.4s, v19.4h, v12.4h\n"
+ "smlal2 v25.4s, v19.8h, v12.8h\n"
"tbz x1, #2, 105f\n"
- "ld1 { v2.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 104f\n"
- "ld1 { v2.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
"tbz x1, #1, 106f\n"
- "ld1 { v2.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 107f\n"
- "ld1 { v2.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d30, [x6, #0xb8]\n"
- "usubl v2.8h, v2.8b, v18.8b\n"
- "ssubl v30.8h, v30.8b, v13.8b\n"
- "ldr x20, [x5, #0x110]\n"
- "smlal v23.4s, v2.4h, v28.4h\n"
- "smlal2 v19.4s, v2.8h, v28.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v29.4h, v30.4h\n"
- "smlal2 v15.4s, v29.8h, v30.8h\n"
- "smlal v20.4s, v25.4h, v30.4h\n"
- "smlal2 v5.4s, v25.8h, v30.8h\n"
- "smlal v24.4s, v2.4h, v30.4h\n"
- "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "ldr d18, [x5, #0xb8]\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "ldr x20, [x4, #0x110]\n"
+ "ssubl v18.8h, v18.8b, v9.8b\n"
+ "smlal v5.4s, v17.4h, v12.4h\n"
+ "smlal2 v30.4s, v17.8h, v12.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v23.4h, v18.4h\n"
+ "smlal2 v0.4s, v23.8h, v18.8h\n"
+ "smlal v27.4s, v28.4h, v18.4h\n"
+ "smlal2 v6.4s, v28.8h, v18.8h\n"
+ "smlal v1.4s, v17.4h, v18.4h\n"
+ "smlal2 v25.4s, v17.8h, v18.8h\n"
"tbz x1, #2, 109f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
"tbz x1, #1, 108f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "ld1 { v3.b }[6], [x20]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "ld1 { v3.b }[4], [x20]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
"tbz x1, #1, 110f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "ld1 { v3.b }[2], [x20]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 111f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "ld1 { v3.b }[0], [x20]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d8, [x6, #0xc0]\n"
- "usubl v27.8h, v27.8b, v18.8b\n"
- "ssubl v8.8h, v8.8b, v13.8b\n"
- "ldr x20, [x5, #0x118]\n"
- "smlal v23.4s, v27.4h, v30.4h\n"
- "smlal2 v19.4s, v27.8h, v30.8h\n"
- "add x20, x20, x3\n"
- "smlal v7.4s, v25.4h, v8.4h\n"
- "smlal2 v15.4s, v25.8h, v8.8h\n"
- "smlal v20.4s, v21.4h, v8.4h\n"
- "smlal2 v5.4s, v21.8h, v8.8h\n"
- "smlal v24.4s, v27.4h, v8.4h\n"
- "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d26, [x5, #0xc0]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x20, [x4, #0x118]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v5.4s, v3.4h, v18.4h\n"
+ "smlal2 v30.4s, v3.8h, v18.8h\n"
+ "add x20, x20, x2\n"
+ "smlal v8.4s, v28.4h, v26.4h\n"
+ "smlal2 v0.4s, v28.8h, v26.8h\n"
+ "smlal v27.4s, v16.4h, v26.4h\n"
+ "smlal2 v6.4s, v16.8h, v26.8h\n"
+ "smlal v1.4s, v3.4h, v26.4h\n"
+ "smlal2 v25.4s, v3.8h, v26.8h\n"
"tbz x1, #2, 113f\n"
- "ld1 { v9.s }[0], [x20], #0x4\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
"tbz x1, #1, 112f\n"
- "ld1 { v9.h }[2], [x20], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[6], [x20]\n"
+ "ld1 { v17.b }[6], [x20]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[4], [x20]\n"
+ "ld1 { v17.b }[4], [x20]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
"tbz x1, #1, 114f\n"
- "ld1 { v9.h }[0], [x20], #0x2\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[2], [x20]\n"
+ "ld1 { v17.b }[2], [x20]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 115f\n"
- "ld1 { v9.b }[0], [x20]\n"
+ "ld1 { v17.b }[0], [x20]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v9.8h, v9.8b, v18.8b\n"
- "smlal v23.4s, v9.4h, v8.4h\n"
- "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "usubl v17.8h, v17.8b, v15.8b\n"
+ "smlal v5.4s, v17.4h, v26.4h\n"
+ "smlal2 v30.4s, v17.8h, v26.8h\n"
"tbz x1, #2, 117f\n"
- "ld1 { v30.4s }, [x7], #0x10\n"
- "ld1 { v12.4s }, [x8], #0x10\n"
+ "ld1 { v9.4s }, [x6], #0x10\n"
+ "ld1 { v20.4s }, [x7], #0x10\n"
"tbz x1, #1, 116f\n"
- "ld1 { v14.d }[0], [x7], #0x8\n"
- "ld1 { v27.d }[0], [x8], #0x8\n"
+ "ld1 { v18.d }[0], [x6], #0x8\n"
+ "ld1 { v3.d }[0], [x7], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v14.s }[2], [x7]\n"
- "ld1 { v27.s }[2], [x8]\n"
+ "ld1 { v18.s }[2], [x6]\n"
+ "ld1 { v3.s }[2], [x7]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v14.s }[0], [x7]\n"
- "ld1 { v27.s }[0], [x8]\n"
+ "ld1 { v18.s }[0], [x6]\n"
+ "ld1 { v3.s }[0], [x7]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
"tbz x1, #1, 118f\n"
- "ld1 { v30.d }[0], [x7], #0x8\n"
- "ld1 { v12.d }[0], [x8], #0x8\n"
+ "ld1 { v9.d }[0], [x6], #0x8\n"
+ "ld1 { v20.d }[0], [x7], #0x8\n"
"tbz x1, #0, 119f\n"
- "ld1 { v30.s }[2], [x7]\n"
- "ld1 { v12.s }[2], [x8]\n"
+ "ld1 { v9.s }[2], [x6]\n"
+ "ld1 { v20.s }[2], [x7]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 119f\n"
- "ld1 { v30.s }[0], [x7]\n"
- "ld1 { v12.s }[0], [x8]\n"
+ "ld1 { v9.s }[0], [x6]\n"
+ "ld1 { v20.s }[0], [x7]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v7.4s, v7.4s, v30.4s\n"
- "and v16.16b, v7.16b, v12.16b\n"
- "add x17, x17, x4\n"
- "add x16, x16, x4\n"
- "sqrdmulh v15.4s, v15.4s, v14.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add x15, x15, x4\n"
- "add x14, x14, x4\n"
- "and v2.16b, v15.16b, v27.16b\n"
- "sqrdmulh v20.4s, v20.4s, v30.4s\n"
- "sqrdmulh v24.4s, v24.4s, v30.4s\n"
- "sqrdmulh v23.4s, v23.4s, v30.4s\n"
- "sqadd v7.4s, v7.4s, v16.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "and v21.16b, v20.16b, v12.16b\n"
- "sqrdmulh v5.4s, v5.4s, v14.4s\n"
- "and v18.16b, v24.16b, v12.16b\n"
- "sqrdmulh v22.4s, v22.4s, v14.4s\n"
- "and v31.16b, v23.16b, v12.16b\n"
- "sqrdmulh v19.4s, v19.4s, v14.4s\n"
- "sqadd v15.4s, v15.4s, v2.4s\n"
- "sshr v21.4s, v21.4s, #0x1f\n"
- "and v9.16b, v5.16b, v27.16b\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "and v4.16b, v22.16b, v27.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v28.16b, v19.16b, v27.16b\n"
- "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v9.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "add x8, x8, x3\n"
+ "add x17, x17, x3\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v9.4s\n"
+ "add x16, x16, x3\n"
+ "add x15, x15, x3\n"
+ "sqrdmulh v5.4s, v5.4s, v9.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "and v17.16b, v8.16b, v20.16b\n"
+ "and v23.16b, v0.16b, v3.16b\n"
+ "and v9.16b, v27.16b, v20.16b\n"
+ "and v26.16b, v1.16b, v20.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v18.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v24.4s, v24.4s, v18.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v23.4s, v23.4s, v31.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "srshl v7.4s, v7.4s, v12.4s\n"
- "srshl v20.4s, v20.4s, v12.4s\n"
- "sqadd v5.4s, v5.4s, v9.4s\n"
- "srshl v24.4s, v24.4s, v12.4s\n"
- "sqadd v22.4s, v22.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v12.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "srshl v15.4s, v15.4s, v27.4s\n"
- "sqxtn v7.4h, v7.4s\n"
- "srshl v5.4s, v5.4s, v27.4s\n"
- "sqxtn v20.4h, v20.4s\n"
- "srshl v22.4s, v22.4s, v27.4s\n"
- "sqxtn v24.4h, v24.4s\n"
- "srshl v19.4s, v19.4s, v27.4s\n"
- "sqxtn v23.4h, v23.4s\n"
- "sqxtn2 v7.8h, v15.4s\n"
- "sqxtn2 v20.8h, v5.4s\n"
- "sqxtn2 v24.8h, v22.4s\n"
- "sqxtn2 v23.8h, v19.4s\n"
- "sqadd v7.8h, v7.8h, v26.8h\n"
- "sqadd v20.8h, v20.8h, v26.8h\n"
- "sqadd v24.8h, v24.8h, v26.8h\n"
- "sqadd v23.8h, v23.8h, v26.8h\n"
- "smax v7.8h, v7.8h, v11.8h\n"
- "smax v20.8h, v20.8h, v11.8h\n"
- "smax v24.8h, v24.8h, v11.8h\n"
- "smax v23.8h, v23.8h, v11.8h\n"
- "smin v7.8h, v7.8h, v0.8h\n"
- "smin v20.8h, v20.8h, v0.8h\n"
- "smin v24.8h, v24.8h, v0.8h\n"
- "smin v23.8h, v23.8h, v0.8h\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "and v24.16b, v6.16b, v3.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v18.16b, v25.16b, v3.16b\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "and v17.16b, v5.16b, v20.16b\n"
+ "sqadd v0.4s, v0.4s, v23.4s\n"
+ "and v16.16b, v30.16b, v3.16b\n"
+ "sqadd v27.4s, v27.4s, v9.4s\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v1.4s, v1.4s, v26.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "sqadd v5.4s, v5.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v24.4s\n"
+ "srshl v1.4s, v1.4s, v20.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "srshl v5.4s, v5.4s, v20.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v25.4s, v25.4s, v3.4s\n"
+ "sqxtn v1.4h, v1.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "sqxtn2 v8.8h, v0.4s\n"
+ "sqxtn2 v27.8h, v6.4s\n"
+ "sqxtn2 v1.8h, v25.4s\n"
+ "sqxtn2 v5.8h, v30.4s\n"
+ "sqadd v8.8h, v8.8h, v13.8h\n"
+ "sqadd v27.8h, v27.8h, v13.8h\n"
+ "sqadd v1.8h, v1.8h, v13.8h\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "smax v8.8h, v8.8h, v10.8h\n"
+ "smax v27.8h, v27.8h, v10.8h\n"
+ "smax v1.8h, v1.8h, v10.8h\n"
+ "smax v5.8h, v5.8h, v10.8h\n"
+ "smin v8.8h, v8.8h, v14.8h\n"
+ "smin v27.8h, v27.8h, v14.8h\n"
+ "smin v1.8h, v1.8h, v14.8h\n"
+ "smin v5.8h, v5.8h, v14.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v1.16b, v1.16b, v1.16b\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
"tbz x1, #2, 121f\n"
- "st1 { v7.s }[0], [x17], #0x4\n"
- "st1 { v20.s }[0], [x16], #0x4\n"
- "st1 { v24.s }[0], [x15], #0x4\n"
- "st1 { v23.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x8], #0x4\n"
+ "st1 { v27.s }[0], [x17], #0x4\n"
+ "st1 { v1.s }[0], [x16], #0x4\n"
+ "st1 { v5.s }[0], [x15], #0x4\n"
"tbz x1, #1, 120f\n"
- "st1 { v7.h }[2], [x17], #0x2\n"
- "st1 { v20.h }[2], [x16], #0x2\n"
- "st1 { v24.h }[2], [x15], #0x2\n"
- "st1 { v23.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x8], #0x2\n"
+ "st1 { v27.h }[2], [x17], #0x2\n"
+ "st1 { v1.h }[2], [x16], #0x2\n"
+ "st1 { v5.h }[2], [x15], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[6], [x17], #0x1\n"
- "st1 { v20.b }[6], [x16], #0x1\n"
- "st1 { v24.b }[6], [x15], #0x1\n"
- "st1 { v23.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x8], #0x1\n"
+ "st1 { v27.b }[6], [x17], #0x1\n"
+ "st1 { v1.b }[6], [x16], #0x1\n"
+ "st1 { v5.b }[6], [x15], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[4], [x17], #0x1\n"
- "st1 { v20.b }[4], [x16], #0x1\n"
- "st1 { v24.b }[4], [x15], #0x1\n"
- "st1 { v23.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x8], #0x1\n"
+ "st1 { v27.b }[4], [x17], #0x1\n"
+ "st1 { v1.b }[4], [x16], #0x1\n"
+ "st1 { v5.b }[4], [x15], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
"tbz x1, #1, 122f\n"
- "st1 { v7.h }[0], [x17], #0x2\n"
- "st1 { v20.h }[0], [x16], #0x2\n"
- "st1 { v24.h }[0], [x15], #0x2\n"
- "st1 { v23.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x8], #0x2\n"
+ "st1 { v27.h }[0], [x17], #0x2\n"
+ "st1 { v1.h }[0], [x16], #0x2\n"
+ "st1 { v5.h }[0], [x15], #0x2\n"
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[2], [x17], #0x1\n"
- "st1 { v20.b }[2], [x16], #0x1\n"
- "st1 { v24.b }[2], [x15], #0x1\n"
- "st1 { v23.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x8], #0x1\n"
+ "st1 { v27.b }[2], [x17], #0x1\n"
+ "st1 { v1.b }[2], [x16], #0x1\n"
+ "st1 { v5.b }[2], [x15], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x1, #0, 123f\n"
- "st1 { v7.b }[0], [x17], #0x1\n"
- "st1 { v20.b }[0], [x16], #0x1\n"
- "st1 { v24.b }[0], [x15], #0x1\n"
- "st1 { v23.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x8], #0x1\n"
+ "st1 { v27.b }[0], [x17], #0x1\n"
+ "st1 { v1.b }[0], [x16], #0x1\n"
+ "st1 { v5.b }[0], [x15], #0x1\n"
"123:" // Oddments: Bit 2: End
"124:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
index c2bec4cdab..bb3de6c865 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,21 +45,21 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"lsr x9, %x[n_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v8.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v7.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v7.4s }, [x21]\n"
"ld1r { v6.16b }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v5.16b }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v5.16b }, [x21]\n"
"ld1r { v4.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v3.4s }, [x21]\n"
"ld1r { v2.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
"mov x11, #0x0\n"
+ "ld1r { v1.4s }, [x20]\n"
"cbz x9, 6f\n"
"1:" // Channel loop
"movi v23.4s, #0x0\n"
@@ -68,75 +68,75 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldr q23, [%x[bias], x20]\n"
"2:" // Channel loop: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
- "mov x25, %x[inptrs]\n"
- "ldp x21, x20, [x25], #0x10\n"
- "subs x24, %x[n_points], #0x1\n"
- "ldr s14, [x21, x11]\n"
- "ldr s15, [x20, x11]\n"
+ "mov x23, %x[inptrs]\n"
+ "subs x22, %x[n_points], #0x1\n"
"mov v24.16b, v23.16b\n"
"mov v25.16b, v23.16b\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s16, [x21, x11]\n"
"mov v26.16b, v23.16b\n"
"mov v27.16b, v23.16b\n"
- "ldr s17, [x20, x11]\n"
- "ldp x21, x20, [x25], #0x10\n"
"mov v28.16b, v23.16b\n"
+ "ldp x21, x20, [x23], #0x10\n"
"mov v29.16b, v23.16b\n"
- "ldr s18, [x21, x11]\n"
- "ldr s19, [x20, x11]\n"
"mov v30.16b, v23.16b\n"
"mov v31.16b, v23.16b\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s20, [x21, x11]\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x20, x11]\n"
- "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x20, x11]\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v17.8h, v17.8b, v6.8b\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
+ "ldr s20, [x21, x11]\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x23], #0x8\n"
"usubl v20.8h, v20.8b, v6.8b\n"
"usubl v21.8h, v21.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
"usubl v22.8h, v22.8b, v6.8b\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x23, x22, [x25], #0x10\n"
- "ldp x21, x20, [x25], #0x10\n"
+ "ldp x21, x20, [x23], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldr s14, [x23, x11]\n"
- "ldr s15, [x22, x11]\n"
+ "subs x22, x22, #0x1\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
- "ldr s16, [x21, x11]\n"
- "ldr s17, [x20, x11]\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldp x21, x20, [x25], #0x10\n"
- "ldr s18, [x21, x11]\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
- "ldr s19, [x20, x11]\n"
- "ldp x21, x20, [x25], #0x10\n"
"smlal v31.4s, v22.4h, v0.4h\n"
- "subs x24, x24, #0x1\n"
"ldr s0, [%x[params]], #0x4\n"
- "ldr s20, [x21, x11]\n"
- "ssubl v0.8h, v0.8b, v5.8b\n"
"usubl v14.8h, v14.8b, v6.8b\n"
- "ldr s21, [x20, x11]\n"
- "ldr x20, [x25], #0x8\n"
"usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"usubl v16.8h, v16.8b, v6.8b\n"
- "ldr s22, [x20, x11]\n"
"usubl v17.8h, v17.8b, v6.8b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x23], #0x10\n"
"usubl v18.8h, v18.8b, v6.8b\n"
"usubl v19.8h, v19.8b, v6.8b\n"
+ "ldr s20, [x21, x11]\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x23], #0x8\n"
"usubl v20.8h, v20.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
"usubl v21.8h, v21.8b, v6.8b\n"
"usubl v22.8h, v22.8b, v6.8b\n"
"bgt 3b\n"
@@ -162,27 +162,27 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
"sshl v25.4s, v25.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "and v18.16b, v23.16b, v1.16b\n"
- "and v17.16b, v24.16b, v1.16b\n"
- "and v16.16b, v25.16b, v1.16b\n"
- "sshl v26.4s, v26.4s, v3.4s\n"
"sshl v27.4s, v27.4s, v3.4s\n"
"sshl v28.4s, v28.4s, v3.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
"sshl v29.4s, v29.4s, v3.4s\n"
"sshl v30.4s, v30.4s, v3.4s\n"
"sshl v31.4s, v31.4s, v3.4s\n"
- "sshr v18.4s, v18.4s, #0x1f\n"
- "sshr v17.4s, v17.4s, #0x1f\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
"sqrdmulh v26.4s, v26.4s, v2.4s\n"
"sqrdmulh v27.4s, v27.4s, v2.4s\n"
"sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
@@ -254,17 +254,17 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"uzp1 v31.16b, v31.16b, v31.16b\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s23, [x28, x11]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s24, [x27, x11]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s25, [x26, x11]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s26, [x25, x11]\n"
+ "str s23, [x28, x11]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x11]\n"
+ "str s25, [x26, x11]\n"
+ "str s26, [x25, x11]\n"
"str s27, [x24, x11]\n"
"str s28, [x23, x11]\n"
"str s29, [x22, x11]\n"
@@ -290,24 +290,24 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"9:" // Oddments: Load bias: Done
"ldr s0, [%x[params]], #0x4\n"
"mov x10, %x[inptrs]\n"
- "ldp x9, x28, [x10], #0x10\n"
"mov v24.16b, v23.16b\n"
- "ldp x27, x26, [x10], #0x10\n"
- "ldp x25, x24, [x10], #0x10\n"
"mov v25.16b, v23.16b\n"
"mov v26.16b, v23.16b\n"
- "ldp x23, x22, [x10], #0x10\n"
- "ldr x21, [x10], #0x8\n"
"mov v27.16b, v23.16b\n"
"mov v28.16b, v23.16b\n"
"mov v29.16b, v23.16b\n"
+ "ldp x9, x28, [x10], #0x10\n"
"mov v30.16b, v23.16b\n"
- "add x9, x9, x11\n"
- "add x28, x28, x11\n"
"mov v31.16b, v23.16b\n"
"ssubl v0.8h, v0.8b, v5.8b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "ldp x25, x24, [x10], #0x10\n"
"add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
"add x23, x23, x11\n"
@@ -358,27 +358,27 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ble 15f\n"
"12:" // Oddments: Planar loop
"ldp x9, x28, [x10], #0x10\n"
- "ldp x27, x26, [x10], #0x10\n"
"smlal v23.4s, v14.4h, v0.4h\n"
"smlal v24.4s, v15.4h, v0.4h\n"
- "ldp x25, x24, [x10], #0x10\n"
- "ldp x23, x22, [x10], #0x10\n"
"smlal v25.4s, v16.4h, v0.4h\n"
"smlal v26.4s, v17.4h, v0.4h\n"
"smlal v27.4s, v18.4h, v0.4h\n"
"smlal v28.4s, v19.4h, v0.4h\n"
- "ldr x21, [x10], #0x8\n"
- "add x9, x9, x11\n"
+ "ldp x27, x26, [x10], #0x10\n"
"smlal v29.4s, v20.4h, v0.4h\n"
"smlal v30.4s, v21.4h, v0.4h\n"
+ "add x9, x9, x11\n"
"add x28, x28, x11\n"
- "add x27, x27, x11\n"
"smlal v31.4s, v22.4h, v0.4h\n"
"ldr s0, [%x[params]], #0x4\n"
- "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "add x27, x27, x11\n"
"add x26, x26, x11\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
"add x25, x25, x11\n"
"add x24, x24, x11\n"
+ "ldr x21, [x10], #0x8\n"
"add x23, x23, x11\n"
"add x22, x22, x11\n"
"add x21, x21, x11\n"
@@ -465,36 +465,36 @@ void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
"sshl v25.4s, v25.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
"ldr x20, [%x[outptrs], #0x40]\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
"add x28, x28, x11\n"
- "and v18.16b, v23.16b, v1.16b\n"
- "and v17.16b, v24.16b, v1.16b\n"
"add x27, x27, x11\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
"add x26, x26, x11\n"
- "and v16.16b, v25.16b, v1.16b\n"
- "sshl v26.4s, v26.4s, v3.4s\n"
"add x25, x25, x11\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
"add x24, x24, x11\n"
- "sshl v27.4s, v27.4s, v3.4s\n"
- "sshl v28.4s, v28.4s, v3.4s\n"
"add x23, x23, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
"add x22, x22, x11\n"
- "sshl v29.4s, v29.4s, v3.4s\n"
- "sshl v30.4s, v30.4s, v3.4s\n"
"add x21, x21, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
"add x20, x20, x11\n"
- "sshl v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sshr v18.4s, v18.4s, #0x1f\n"
"sshr v17.4s, v17.4s, #0x1f\n"
"sshr v16.4s, v16.4s, #0x1f\n"
- "sqrdmulh v26.4s, v26.4s, v2.4s\n"
- "sqrdmulh v27.4s, v27.4s, v2.4s\n"
- "sqrdmulh v28.4s, v28.4s, v2.4s\n"
"sqrdmulh v29.4s, v29.4s, v2.4s\n"
"sqrdmulh v30.4s, v30.4s, v2.4s\n"
"sqrdmulh v31.4s, v31.4s, v2.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index ed99f1f642..2a65f9af21 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,21 +49,21 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"lsr x10, %x[n_output_channels], #0x2\n"
"add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
"ld1r { v15.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
- "ld1r { v14.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_maxval]\n"
"add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.4s }, [x21]\n"
"ld1r { v13.16b }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n"
"add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v12.16b }, [x21]\n"
"ld1r { v11.4s }, [x20]\n"
- "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
- "ld1r { v10.4s }, [x20]\n"
+ "add x21, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v10.4s }, [x21]\n"
"ld1r { v9.4s }, [x20]\n"
"add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
- "ld1r { v8.4s }, [x20]\n"
"mov x9, #0x0\n"
+ "ld1r { v8.4s }, [x20]\n"
"cbz x10, 9f\n"
"1:" // Output channel loop
"movi v31.4s, #0x0\n"
@@ -96,20 +96,20 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"3:" // Output channel loop: Load quantization parameters: Done
"ldr s5, [%x[weights]], #0x4\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
+ "ldp x21, x20, [x22], #0x10\n"
"ldr d0, [x21, #0x0]\n"
"ldr d4, [x20, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"usubl v0.8h, v0.8b, v13.8b\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"cbz x23, 7f\n"
"ldr s7, [%x[weights]], #0x4\n"
"ldp x21, x20, [x22], #0x10\n"
"subs x23, x23, #0x1\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
"ldr d3, [x21, #0x0]\n"
"ldr d6, [x20, #0x0]\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"usubl v3.8h, v3.8b, v13.8b\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"beq 5f\n"
@@ -125,13 +125,13 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x21, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d4, [x20, #0x0]\n"
@@ -139,22 +139,22 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"ldp x21, x20, [x22], #0x10\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
"ldr d3, [x21, #0x0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"ldr d6, [x20, #0x0]\n"
@@ -172,54 +172,54 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smlal v19.4s, v5.4h, v0.h[3]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "smlal v16.4s, v7.4h, v3.h[0]\n"
- "smlal v17.4s, v7.4h, v3.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
- "smlal v18.4s, v7.4h, v3.h[2]\n"
- "smlal v19.4s, v7.4h, v3.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
- "sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
- "and v3.16b, v16.16b, v8.16b\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
- "and v2.16b, v17.16b, v8.16b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
- "and v1.16b, v18.16b, v8.16b\n"
- "and v0.16b, v19.16b, v8.16b\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
"sshl v20.4s, v20.4s, v10.4s\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
- "smlal v28.4s, v7.4h, v6.h[4]\n"
- "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -357,49 +357,49 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -421,70 +421,70 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x20, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
"ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr s5, [%x[weights]], #0x4\n"
"ldr d4, [x28, #0x0]\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
"smlal v16.4s, v5.4h, v0.h[0]\n"
"smlal v17.4s, v5.4h, v0.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
"smlal v18.4s, v5.4h, v0.h[2]\n"
"smlal v19.4s, v5.4h, v0.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "smlal v20.4s, v7.4h, v3.h[4]\n"
- "smlal v21.4s, v7.4h, v3.h[5]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "smlal v22.4s, v7.4h, v3.h[6]\n"
- "smlal v23.4s, v7.4h, v3.h[7]\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
"smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v19.4s, v19.4s, v9.4s\n"
- "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
- "and v3.16b, v16.16b, v8.16b\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
- "and v2.16b, v17.16b, v8.16b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
- "and v1.16b, v18.16b, v8.16b\n"
- "and v0.16b, v19.16b, v8.16b\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
"sshl v20.4s, v20.4s, v10.4s\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
"sshl v21.4s, v21.4s, v10.4s\n"
"sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
"sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
- "smlal v28.4s, v5.4h, v4.h[4]\n"
- "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -622,49 +622,49 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -673,45 +673,45 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"7:" // Output channel loop: Single kernel point
"smlal v16.4s, v5.4h, v0.h[0]\n"
"smlal v17.4s, v5.4h, v0.h[1]\n"
- "sshl v16.4s, v16.4s, v10.4s\n"
"ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
"smlal v18.4s, v5.4h, v0.h[2]\n"
"smlal v19.4s, v5.4h, v0.h[3]\n"
- "sshl v17.4s, v17.4s, v10.4s\n"
- "ldr x26, [%x[outptrs], #0x8]\n"
- "sshl v18.4s, v18.4s, v10.4s\n"
- "sshl v19.4s, v19.4s, v10.4s\n"
- "smlal v20.4s, v5.4h, v0.h[4]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
"smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
"smlal v22.4s, v5.4h, v0.h[6]\n"
- "sqrdmulh v16.4s, v16.4s, v9.4s\n"
- "ldr x24, [%x[outptrs], #0x18]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
- "sqrdmulh v17.4s, v17.4s, v9.4s\n"
- "ldr x23, [%x[outptrs], #0x20]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
- "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
- "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
"and v3.16b, v16.16b, v8.16b\n"
- "smlal v27.4s, v5.4h, v4.h[3]\n"
- "ldr x21, [%x[outptrs], #0x30]\n"
"and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
"and v1.16b, v18.16b, v8.16b\n"
- "smlal v28.4s, v5.4h, v4.h[4]\n"
- "ldr x20, [%x[outptrs], #0x38]\n"
"and v0.16b, v19.16b, v8.16b\n"
- "sshl v20.4s, v20.4s, v10.4s\n"
- "smlal v29.4s, v5.4h, v4.h[5]\n"
- "sshl v21.4s, v21.4s, v10.4s\n"
- "sshl v22.4s, v22.4s, v10.4s\n"
- "smlal v30.4s, v5.4h, v4.h[6]\n"
- "sshl v23.4s, v23.4s, v10.4s\n"
- "sshl v24.4s, v24.4s, v10.4s\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshr v3.4s, v3.4s, #0x1f\n"
"sshr v2.4s, v2.4s, #0x1f\n"
@@ -848,49 +848,49 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smax v30.4s, v30.4s, v15.4s\n"
"smax v31.4s, v31.4s, v15.4s\n"
"uzp1 v16.16b, v16.16b, v16.16b\n"
- "str s16, [x27, x9]\n"
- "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v18.16b, v18.16b, v18.16b\n"
- "str s17, [x26, x9]\n"
- "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v19.16b, v19.16b, v19.16b\n"
"uzp1 v20.16b, v20.16b, v20.16b\n"
- "str s18, [x25, x9]\n"
- "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v21.16b, v21.16b, v21.16b\n"
"uzp1 v22.16b, v22.16b, v22.16b\n"
- "str s19, [x24, x9]\n"
- "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s20, [x23, x9]\n"
- "ldr x23, [%x[outptrs], #0x60]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s21, [x22, x9]\n"
- "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s22, [x21, x9]\n"
- "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s23, [x20, x9]\n"
- "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
"uzp1 v24.16b, v24.16b, v24.16b\n"
- "str s24, [x27, x9]\n"
"uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
"uzp1 v26.16b, v26.16b, v26.16b\n"
- "str s25, [x26, x9]\n"
"uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
"uzp1 v28.16b, v28.16b, v28.16b\n"
- "str s26, [x25, x9]\n"
"uzp1 v29.16b, v29.16b, v29.16b\n"
"uzp1 v30.16b, v30.16b, v30.16b\n"
- "str s27, [x24, x9]\n"
"uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s24, [x27, x9]\n"
+ "str s25, [x26, x9]\n"
+ "str s26, [x25, x9]\n"
+ "str s27, [x24, x9]\n"
"str s28, [x23, x9]\n"
"str s29, [x22, x9]\n"
"str s30, [x21, x9]\n"
@@ -965,20 +965,20 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"18:" // Output channel oddments: Load quantization parameters: Done
"ldr s5, [%x[weights]], #0x4\n"
"mov x22, %x[inptrs]\n"
- "ldp x21, x20, [x22], #0x10\n"
"lsr x23, %x[kernel_points], #0x1\n"
+ "ldp x21, x20, [x22], #0x10\n"
"ldr d0, [x21, #0x0]\n"
"ldr d4, [x20, #0x0]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"usubl v0.8h, v0.8b, v13.8b\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"cbz x23, 22f\n"
"ldr s7, [%x[weights]], #0x4\n"
"ldp x21, x20, [x22], #0x10\n"
"subs x23, x23, #0x1\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
"ldr d3, [x21, #0x0]\n"
"ldr d6, [x20, #0x0]\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
"usubl v3.8h, v3.8b, v13.8b\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"beq 20f\n"
@@ -994,13 +994,13 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d0, [x21, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d4, [x20, #0x0]\n"
@@ -1008,22 +1008,22 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"ldp x21, x20, [x22], #0x10\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
"ldr d3, [x21, #0x0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
"smlal v27.4s, v7.4h, v6.h[3]\n"
"smlal v28.4s, v7.4h, v6.h[4]\n"
"smlal v29.4s, v7.4h, v6.h[5]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
"smlal v30.4s, v7.4h, v6.h[6]\n"
"smlal v31.4s, v7.4h, v6.h[7]\n"
"ldr d6, [x20, #0x0]\n"
@@ -1077,27 +1077,27 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"smlal v22.4s, v5.4h, v0.h[6]\n"
"smlal v23.4s, v5.4h, v0.h[7]\n"
"ldr d2, [x21, #0x0]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
"smlal v24.4s, v5.4h, v4.h[0]\n"
"smlal v25.4s, v5.4h, v4.h[1]\n"
"smlal v26.4s, v5.4h, v4.h[2]\n"
"smlal v27.4s, v5.4h, v4.h[3]\n"
"smlal v28.4s, v5.4h, v4.h[4]\n"
"smlal v29.4s, v5.4h, v4.h[5]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
"smlal v30.4s, v5.4h, v4.h[6]\n"
"smlal v31.4s, v5.4h, v4.h[7]\n"
"ldr d1, [x20, #0x0]\n"
"ldr s0, [%x[weights]], #0x4\n"
"smlal v16.4s, v7.4h, v3.h[0]\n"
"smlal v17.4s, v7.4h, v3.h[1]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
"smlal v18.4s, v7.4h, v3.h[2]\n"
"smlal v19.4s, v7.4h, v3.h[3]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
"smlal v20.4s, v7.4h, v3.h[4]\n"
"smlal v21.4s, v7.4h, v3.h[5]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
"smlal v22.4s, v7.4h, v3.h[6]\n"
"smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
"smlal v24.4s, v7.4h, v6.h[0]\n"
"smlal v25.4s, v7.4h, v6.h[1]\n"
"smlal v26.4s, v7.4h, v6.h[2]\n"
@@ -1145,18 +1145,18 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"sshl v17.4s, v17.4s, v10.4s\n"
"sshl v18.4s, v18.4s, v10.4s\n"
"sshl v19.4s, v19.4s, v10.4s\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
"sqrdmulh v16.4s, v16.4s, v9.4s\n"
"sqrdmulh v17.4s, v17.4s, v9.4s\n"
"sqrdmulh v18.4s, v18.4s, v9.4s\n"
"sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
"and v3.16b, v16.16b, v8.16b\n"
"and v2.16b, v17.16b, v8.16b\n"
"and v1.16b, v18.16b, v8.16b\n"
"and v0.16b, v19.16b, v8.16b\n"
- "sshl v20.4s, v20.4s, v10.4s\n"
- "sshl v21.4s, v21.4s, v10.4s\n"
- "sshl v22.4s, v22.4s, v10.4s\n"
- "sshl v23.4s, v23.4s, v10.4s\n"
"sshl v24.4s, v24.4s, v10.4s\n"
"sshl v25.4s, v25.4s, v10.4s\n"
"sshr v3.4s, v3.4s, #0x1f\n"
@@ -1320,47 +1320,47 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"tbz %x[n_output_channels], #1, 24f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.h }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.h }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.h }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.h }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.h }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.h }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.h }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.h }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
- "add x9, x9, #0x2\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.h }[0], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.h }[0], [x26]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
"st1 { v26.h }[0], [x25]\n"
"st1 { v27.h }[0], [x24]\n"
"st1 { v28.h }[0], [x23]\n"
@@ -1370,46 +1370,46 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"tbz %x[n_output_channels], #0, 25f\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.b }[2], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.b }[2], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.b }[2], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.b }[2], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.b }[2], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.b }[2], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.b }[2], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.b }[2], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.b }[2], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.b }[2], [x26]\n"
+ "add x20, x20, x9\n"
"st1 { v26.b }[2], [x25]\n"
"st1 { v27.b }[2], [x24]\n"
"st1 { v28.b }[2], [x23]\n"
@@ -1420,46 +1420,46 @@ void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
"24:" // Output channel oddments: Done: Store: Bit 1: Unset
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "add x27, x27, x9\n"
- "add x26, x26, x9\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "add x25, x25, x9\n"
- "add x24, x24, x9\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "add x23, x23, x9\n"
- "add x22, x22, x9\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "add x21, x21, x9\n"
- "add x20, x20, x9\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
"st1 { v16.b }[0], [x27]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "add x27, x27, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v17.b }[0], [x26]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "add x26, x26, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"st1 { v18.b }[0], [x25]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "add x25, x25, x9\n"
"st1 { v19.b }[0], [x24]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "add x24, x24, x9\n"
+ "add x27, x27, x9\n"
"st1 { v20.b }[0], [x23]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "add x23, x23, x9\n"
+ "add x26, x26, x9\n"
"st1 { v21.b }[0], [x22]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "add x22, x22, x9\n"
+ "add x25, x25, x9\n"
"st1 { v22.b }[0], [x21]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "add x21, x21, x9\n"
+ "add x24, x24, x9\n"
"st1 { v23.b }[0], [x20]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "add x20, x20, x9\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
"st1 { v24.b }[0], [x27]\n"
+ "add x21, x21, x9\n"
"st1 { v25.b }[0], [x26]\n"
+ "add x20, x20, x9\n"
"st1 { v26.b }[0], [x25]\n"
"st1 { v27.b }[0], [x24]\n"
"st1 { v28.b }[0], [x23]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 2b6f70c089..74a68d6929 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
namespace arm_conv {
namespace depthwise {
@@ -65,3 +67,5 @@ class sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 2d558ade3f..ca58dbc10f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -102,33 +102,33 @@ void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"mul x20, x4, x21\n" // offset = tile_i * ld_input_row
"ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
- "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"add x17, x6, x6\n"
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x16, x17, x6\n"
"add x7, x7, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x16, x7, x21, LSL #1\n"
- "add x15, x17, x6\n"
- "add x14, x16, x21, LSL #1\n"
+ "add x15, x7, x21, LSL #1\n"
+ "add x14, x15, x21, LSL #1\n"
"add x13, x14, x21, LSL #1\n"
"cbnz x5, 2f\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"lsl x12, %x[n_channels], #0x1\n"
"mov x21, #0x4\n"
"mul x21, x21, x6\n"
- "add x11, x16, x6, LSL #1\n"
- "add x10, x7, x15, LSL #1\n"
- "add x9, x16, x17, LSL #1\n"
- "sub x20, x24, x5\n"
+ "add x11, x15, x6, LSL #1\n"
+ "add x10, x7, x16, LSL #1\n"
+ "add x9, x15, x17, LSL #1\n"
+ "sub x20, x20, x5\n"
"add x28, x14, x6, LSL #1\n"
"sub x20, x20, #0x1\n"
- "add x27, x13, x15, LSL #1\n"
+ "add x27, x13, x16, LSL #1\n"
"and x20, x20, #0x3fffff\n"
"add x26, x7, x6, LSL #1\n"
"orr x12, x12, x20, LSL #22\n"
"add x25, x7, x17, LSL #1\n"
"orr x12, x12, x21, LSL #38\n"
"add x24, x14, x17, LSL #1\n"
- "add x23, x16, x15, LSL #1\n"
- "add x22, x14, x15, LSL #1\n"
+ "add x23, x15, x16, LSL #1\n"
+ "add x22, x14, x16, LSL #1\n"
"add x21, x13, x6, LSL #1\n"
"add x20, x13, x17, LSL #1\n"
".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
@@ -141,187 +141,187 @@ void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
- ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac49fa // rprfm pldonce, x12, [x15]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mov x20, #0x2\n"
- "ld1h { z18.h }, p3/Z, [x8]\n"
+ "ld1h { z22.h }, p3/Z, [x8]\n"
"addvl x8, x8, #1\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cnth x24\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x25\n"
".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
"addvl x8, x8, #4\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
"addvl x8, x8, #4\n"
- "mul x22, x4, x26\n" // offset = tile_i * ld_output_row
- "cmp x24, %x[n_channels]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "madd x22, x5, x25, x22\n" // offset += tile_j * ld_output_col
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x22, x4, x23\n" // offset = tile_i * ld_output_row
+ "cmp x25, %x[n_channels]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x5, x26, x22\n" // offset += tile_j * ld_output_col
+ "ld1rh { z21.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mov x21, #0x0\n"
"mul x22, x22, x20\n" // offset *= output_tile_size
- "sub x20, XZR, x24\n"
+ "sub x20, XZR, x25\n"
"ld1h { z8.h }, p3/Z, [x8]\n"
- "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z9.h }, p2/Z, [x16, x6, LSL #1]\n"
+ "add x24, x24, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
"addvl x8, x8, #1\n"
- "add x22, x23, x26, LSL #1\n"
+ "add x23, x24, x23, LSL #1\n"
"ld1h { z10.h }, p2/Z, [x7]\n"
- "ld1h { z11.h }, p2/Z, [x7, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x15, x17, LSL #1]\n"
"ld1h { z13.h }, p2/Z, [x14, x6, LSL #1]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "whilelt p1.h, x24, %x[n_channels]\n"
+ "movprfx z24, z22\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z25, z22\n fmla z25.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x25, %x[n_channels]\n"
"inch x21\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "inch x24\n"
- "ld1h { z18.h }, p3/Z, [x8]\n"
+ "movprfx z26, z22\n fmla z26.h, p3/M, z1.h, z9.h\n"
+ "movprfx z27, z22\n fmla z27.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z17.h }, p2/Z, [x13]\n"
+ "inch x25\n"
+ "ld1h { z22.h }, p3/Z, [x8]\n"
"addvl x8, x8, #1\n"
"mov p0.b, p2.b\n"
"inch x20\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z28.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z17.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z14.h }, p2/Z, [x7, x17, LSL #1]\n"
"addvl x7, x7, #1\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
- "addvl x16, x16, #1\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x14]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z25.h, p3/M, z6.h, z13.h\n"
+ "fmla z26.h, p3/M, z4.h, z13.h\n"
+ "fmla z27.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x15]\n"
+ "fmla z24.h, p3/M, z1.h, z28.h\n"
+ "fmla z25.h, p3/M, z0.h, z28.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, x16, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z27.h, p3/M, z4.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z14.h\n"
+ "fmla z25.h, p3/M, z1.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x14]\n"
+ "fmla z26.h, p3/M, z0.h, z17.h\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x14, x16, LSL #1]\n"
"addvl x14, x14, #1\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
"ld1h { z13.h }, p1/Z, [x14, x6, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
"addvl x8, x8, #4\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "cmp x24, %x[n_channels]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
"addvl x13, x13, #1\n"
- "ld1h { z11.h }, p1/Z, [x7, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x16, x6, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z27.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z11.h }, p1/Z, [x7, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x15, x6, LSL #1]\n"
"ld1h { z10.h }, p1/Z, [x7]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmla z26.h, p3/M, z8.h, z16.h\n"
+ "fmla z27.h, p3/M, z7.h, z16.h\n"
".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
"addvl x8, x8, #4\n"
- "ld1h { z12.h }, p1/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x15, x17, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x8]\n"
"addvl x8, x8, #1\n"
- ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ ".inst 0xc175c9f8 // fclamp { z24.h-z27.h }, z15.h, z21.h\n"
+ "st1h { z24.h }, p0, [x24]\n"
+ "st1h { z25.h }, p0, [x24, x26, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x26, LSL #1]\n"
"addvl x23, x23, #1\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "movprfx z28, z22\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z22\n fmla z29.h, p3/M, z3.h, z9.h\n"
"ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"mov p0.b, p2.b\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
+ "movprfx z30, z22\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z22\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x13]\n"
"ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"add x5, x5, #0x1\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
"fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x13, x16, LSL #1]\n"
"add x20, x4, #0x1\n"
"fmla z30.h, p3/M, z2.h, z12.h\n"
"fmla z31.h, p3/M, z1.h, z12.h\n"
- "cmp x5, x24\n"
+ "ld1h { z20.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "cmp x5, x22\n"
"csel x4, x4, x20, LT\n"
"csel x5, x5, XZR, LT\n"
"cmp x4, x21\n"
"fmla z28.h, p3/M, z5.h, z12.h\n"
"fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
"fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x7, x17, LSL #1]\n"
"fmla z28.h, p3/M, z7.h, z13.h\n"
"fmla z29.h, p3/M, z6.h, z13.h\n"
"fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x14]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "fmla z31.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x15]\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z20.h\n"
+ "fmla z31.h, p3/M, z4.h, z20.h\n"
+ "fmla z28.h, p3/M, z2.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z17.h\n"
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "fmla z28.h, p3/M, z8.h, z20.h\n"
+ "fmla z29.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x14, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z19.h\n"
+ "fmla z31.h, p3/M, z5.h, z18.h\n"
+ "fmla z28.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z17.h\n"
+ "fmla z31.h, p3/M, z6.h, z17.h\n"
+ "fmla z28.h, p3/M, z6.h, z19.h\n"
+ "fmla z29.h, p3/M, z8.h, z18.h\n"
+ "fmla z30.h, p3/M, z8.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ ".inst 0xc175c9fc // fclamp { z28.h-z31.h }, z15.h, z21.h\n"
+ "st1h { z28.h }, p0, [x24]\n"
+ "st1h { z29.h }, p0, [x24, x26, LSL #1]\n"
+ "st1h { z30.h }, p0, [x23]\n"
+ "st1h { z31.h }, p0, [x23, x26, LSL #1]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
@@ -333,4 +333,4 @@ void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 415e344832..b4449ec76f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -85,185 +85,185 @@ void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ldr x13, [x16, #0x20]\n"
- "cnth x12\n"
+ "ldr x24, [x16, #0x20]\n"
+ "cnth x13\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "cmp x12, %x[n_channels]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x9, XZR, x12\n"
- "ldp x28, x27, [x20, #0x10]\n"
- "ld1h { z16.h }, p3/Z, [x14]\n"
+ "ld1rh { z22.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x10, XZR, x13\n"
+ "ldp x9, x28, [x20, #0x10]\n"
+ "ld1h { z20.h }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x23, x22, [x16, #0x0]\n"
".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
"ld1h { z8.h }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "ldr x22, [x16, #0x28]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z25, z20\n fmla z25.h, p3/M, z3.h, z9.h\n"
+ "ldr x20, [x16, #0x28]\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "movprfx z26, z20\n fmla z26.h, p3/M, z1.h, z9.h\n"
+ "movprfx z27, z20\n fmla z27.h, p3/M, z0.h, z9.h\n"
"ldr x21, [x16, #0x30]\n"
- "ld1h { z16.h }, p3/Z, [x14]\n"
- "ldr x20, [x16, #0x38]\n"
+ "ld1h { z20.h }, p3/Z, [x14]\n"
+ "ldr x24, [x16, #0x38]\n"
"addvl x14, x14, #1\n"
- "inch x9\n"
- "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ldr x25, [x16, #0x48]\n"
+ "inch x10\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x48]\n"
"mov p0.b, p2.b\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
- "ldr x26, [x16, #0x40]\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x24, [x16, #0x50]\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x23, [x16, #0x58]\n"
- "fmla z28.h, p3/M, z5.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
- "ldr x13, [x16, #0x60]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ldr x22, [x16, #0x68]\n"
- "ldr x21, [x16, #0x70]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z13.h\n"
- "ldr x20, [x16, #0x78]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldp x26, x25, [x16, #0x0]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
- "ldp x24, x23, [x16, #0x10]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x13, [x16, #0x20]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "ld1h { z13.h }, p1/Z, [x13, x12, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ldr x22, [x16, #0x50]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "ldr x20, [x16, #0x60]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z25.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z28.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x68]\n"
+ "fmla z26.h, p3/M, z6.h, z17.h\n"
+ "fmla z27.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z14.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x78]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z25.h, p3/M, z6.h, z13.h\n"
+ "fmla z26.h, p3/M, z4.h, z13.h\n"
+ "fmla z27.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z28.h\n"
+ "fmla z25.h, p3/M, z0.h, z28.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z27.h, p3/M, z4.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z14.h\n"
+ "fmla z25.h, p3/M, z1.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "fmla z26.h, p3/M, z0.h, z17.h\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z13.h }, p1/Z, [x20, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z18.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "inch x15\n"
".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
- "inch x15\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x24, x12, LSL #1]\n"
"whilelt p2.h, x15, %x[n_channels]\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x26, x12, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p1/Z, [x25, x12, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x23, x12, LSL #1]\n"
- "inch x12\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z27.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z11.h }, p1/Z, [x22, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x24, x13, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x23, x13, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z16.h\n"
+ "fmla z27.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x13, LSL #1]\n"
+ "inch x13\n"
".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "cmp x12, %x[n_channels]\n"
+ "cmp x13, %x[n_channels]\n"
"ld1h { z8.h }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
- "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
- "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
- "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
- "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ ".inst 0xc16fcad8 // fclamp { z24.h-z27.h }, z22.h, z15.h\n"
+ "st1h { z24.h }, p0, [x12, x10, LSL #1]\n"
+ "st1h { z25.h }, p0, [x11, x10, LSL #1]\n"
+ "st1h { z26.h }, p0, [x9, x10, LSL #1]\n"
+ "st1h { z27.h }, p0, [x28, x10, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
- "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
- "ldr x22, [x16, #0x28]\n"
- "inch x9\n"
- "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ldr x21, [x16, #0x30]\n"
+ "movprfx z28, z20\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z20\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "inch x10\n"
+ "movprfx z30, z20\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z20\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x20, [x16, #0x30]\n"
"mov p0.b, p2.b\n"
- "ldr x20, [x16, #0x38]\n"
- "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ldr x25, [x16, #0x48]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x48]\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
"fmla z29.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
- "ldr x26, [x16, #0x40]\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla z30.h, p3/M, z2.h, z12.h\n"
"fmla z31.h, p3/M, z1.h, z12.h\n"
- "ldr x24, [x16, #0x50]\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x23, [x16, #0x58]\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
"fmla z28.h, p3/M, z5.h, z12.h\n"
"fmla z29.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
- "ldr x13, [x16, #0x60]\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
"ldr x22, [x16, #0x68]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x15, LSL #1]\n"
"ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmla z28.h, p3/M, z7.h, z13.h\n"
"fmla z29.h, p3/M, z6.h, z13.h\n"
- "ldr x20, [x16, #0x78]\n"
"fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z1.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z11.h\n"
- "fmla z31.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z8.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z12.h\n"
- ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
- "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
- "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
- "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
- "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ "fmla z31.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "fmla z29.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z20.h\n"
+ "fmla z31.h, p3/M, z4.h, z20.h\n"
+ "fmla z28.h, p3/M, z2.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z17.h\n"
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "fmla z28.h, p3/M, z8.h, z20.h\n"
+ "fmla z29.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z19.h\n"
+ "fmla z31.h, p3/M, z5.h, z18.h\n"
+ "fmla z28.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z17.h\n"
+ "fmla z31.h, p3/M, z6.h, z17.h\n"
+ "fmla z28.h, p3/M, z6.h, z19.h\n"
+ "fmla z29.h, p3/M, z8.h, z18.h\n"
+ "fmla z30.h, p3/M, z8.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ ".inst 0xc16fcadc // fclamp { z28.h-z31.h }, z22.h, z15.h\n"
+ "st1h { z28.h }, p0, [x12, x10, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x10, LSL #1]\n"
+ "st1h { z30.h }, p0, [x9, x10, LSL #1]\n"
+ "st1h { z31.h }, p0, [x28, x10, LSL #1]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
@@ -274,4 +274,4 @@ void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index f90fbc3906..9622603947 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
namespace arm_conv {
namespace depthwise {
@@ -65,3 +67,5 @@ class sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 3a7d1cb0b4..a2fe312a1c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -102,56 +102,56 @@ void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
"mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
- "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"add x7, x4, x4\n"
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x8, x7, x4\n"
"add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x8, x5, x21, LSL #1\n"
- "add x17, x7, x4\n"
- "add x16, x8, x21, LSL #1\n"
- "add x15, x17, x4\n"
- "add x14, x16, x21, LSL #1\n"
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #1\n"
+ "add x15, x16, x21, LSL #1\n"
+ "add x14, x15, x21, LSL #1\n"
"add x13, x14, x21, LSL #1\n"
"cbnz x3, 2f\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"lsl x12, %x[n_channels], #0x1\n"
"mov x28, #0x6\n"
"mul x28, x28, x4\n"
- "add x27, x16, x7, LSL #1\n"
- "add x26, x5, x15, LSL #1\n"
- "add x25, x8, x7, LSL #1\n"
- "sub x20, x9, x3\n"
- "add x24, x13, x15, LSL #1\n"
+ "add x27, x15, x7, LSL #1\n"
+ "add x26, x5, x17, LSL #1\n"
+ "add x25, x16, x7, LSL #1\n"
+ "sub x20, x20, x3\n"
+ "add x24, x13, x17, LSL #1\n"
"sub x20, x20, #0x1\n"
- "add x23, x16, x4, LSL #1\n"
+ "add x23, x15, x4, LSL #1\n"
"and x20, x20, #0x3fffff\n"
"add x22, x5, x4, LSL #1\n"
"orr x12, x12, x20, LSL #22\n"
- "add x21, x5, x17, LSL #1\n"
+ "add x21, x5, x8, LSL #1\n"
"orr x12, x12, x28, LSL #38\n"
- "add x20, x16, x17, LSL #1\n"
- "add x11, x8, x15, LSL #1\n"
+ "add x20, x15, x8, LSL #1\n"
+ "add x11, x16, x17, LSL #1\n"
"add x10, x14, x7, LSL #1\n"
- "add x9, x14, x15, LSL #1\n"
+ "add x9, x14, x17, LSL #1\n"
"add x28, x13, x4, LSL #1\n"
".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
- "add x27, x8, x4, LSL #1\n"
+ "add x27, x16, x4, LSL #1\n"
".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
- "add x26, x8, x17, LSL #1\n"
+ "add x26, x16, x8, LSL #1\n"
".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
- "add x25, x13, x17, LSL #1\n"
+ "add x25, x13, x8, LSL #1\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
"add x24, x14, x4, LSL #1\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
"add x23, x5, x7, LSL #1\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
- "add x22, x14, x17, LSL #1\n"
+ "add x22, x14, x8, LSL #1\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
- "add x21, x16, x15, LSL #1\n"
+ "add x21, x15, x17, LSL #1\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"add x20, x13, x7, LSL #1\n"
- ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
@@ -163,312 +163,312 @@ void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
- ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac49fa // rprfm pldonce, x12, [x15]\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mov x21, #0x3\n"
- "ld1h { z18.h }, p3/Z, [x6]\n"
+ "ld1h { z25.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
"ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cnth x26\n"
+ "cnth x22\n"
".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "mul x20, x2, x22\n" // offset = tile_i * ld_output_row
- "cmp x26, %x[n_channels]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x2, x23\n" // offset = tile_i * ld_output_row
+ "cmp x22, %x[n_channels]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"madd x20, x3, x27, x20\n" // offset += tile_j * ld_output_col
- "add x24, x27, x27\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x25, x27, x27\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mul x20, x20, x21\n" // offset *= output_tile_size
"mov x21, #0x0\n"
"ld1h { z8.h }, p3/Z, [x6]\n"
- "add x25, x25, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "sub x20, XZR, x26\n"
- "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
- "add x23, x25, x22, LSL #1\n"
+ "add x26, x26, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "sub x20, XZR, x22\n"
+ "ld1h { z9.h }, p2/Z, [x15, x7, LSL #1]\n"
+ "add x24, x26, x23, LSL #1\n"
"ld1h { z10.h }, p2/Z, [x5]\n"
"addvl x6, x6, #1\n"
- "add x22, x23, x22, LSL #1\n"
- "ld1h { z11.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "add x23, x24, x23, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x5, x17, LSL #1]\n"
"ld1h { z12.h }, p2/Z, [x13]\n"
- "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x16, x7, LSL #1]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x26, %x[n_channels]\n"
+ "movprfx z28, z25\n fmla z28.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z25\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x22, %x[n_channels]\n"
"inch x21\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "inch x26\n"
+ "movprfx z29, z25\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z30, z25\n fmla z30.h, p3/M, z5.h, z9.h\n"
+ "inch x22\n"
"mov p0.b, p2.b\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "movprfx z31, z25\n fmla z31.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z25\n fmla z16.h, p3/M, z3.h, z9.h\n"
"inch x20\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "movprfx z17, z25\n fmla z17.h, p3/M, z2.h, z9.h\n"
+ "movprfx z19, z25\n fmla z19.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z4.h, z13.h\n"
"fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z13.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z16.h, p3/M, z0.h, z13.h\n"
+ "fmla z17.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "movprfx z18, z25\n fmla z18.h, p3/M, z1.h, z9.h\n"
+ "fmla z28.h, p3/M, z6.h, z20.h\n"
"fmla z23.h, p3/M, z5.h, z13.h\n"
- "ld1h { z18.h }, p3/Z, [x6]\n"
+ "ld1h { z25.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x8]\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x14]\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "addvl x8, x8, #1\n"
- "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z27.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z20.h\n"
+ "fmla z19.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z24.h }, p2/Z, [x5, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z20.h\n"
+ "fmla z18.h, p3/M, z0.h, z20.h\n"
+ "fmla z17.h, p3/M, z1.h, z20.h\n"
+ "fmla z28.h, p3/M, z0.h, z27.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z21.h }, p2/Z, [x16]\n"
+ "fmla z29.h, p3/M, z1.h, z24.h\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z19.h, p3/M, z1.h, z10.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "fmla z28.h, p3/M, z2.h, z24.h\n"
+ "fmla z23.h, p3/M, z1.h, z27.h\n"
+ "ld1h { z13.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x14]\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "fmla z16.h, p3/M, z2.h, z13.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z27.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z21.h\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z22.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z20.h\n"
+ "ld1h { z20.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z27.h\n"
+ "fmla z19.h, p3/M, z3.h, z27.h\n"
+ "ld1h { z21.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z27.h\n"
+ "fmla z16.h, p3/M, z6.h, z27.h\n"
+ "fmla z17.h, p3/M, z5.h, z27.h\n"
+ "fmla z30.h, p3/M, z8.h, z27.h\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "fmla z19.h, p3/M, z5.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z20.h\n"
+ "fmla z16.h, p3/M, z8.h, z22.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z9.h }, p2/Z, [x16, x8, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z17.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z20.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z23.h, p3/M, z4.h, z21.h\n"
+ "fmla z30.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "fmla z18.h, p3/M, z8.h, z20.h\n"
+ "fmla z19.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x8, LSL #1]\n"
"addvl x14, x14, #1\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x5, x7, LSL #1]\n"
"addvl x5, x5, #1\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z17.h, p3/M, z4.h, z21.h\n"
+ "fmla z30.h, p3/M, z7.h, z21.h\n"
"ld1h { z10.h }, p1/Z, [x5]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x16]\n"
- "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "addvl x16, x16, #1\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
- "addvl x6, x6, #4\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z18.h, p3/M, z3.h, z21.h\n"
+ "fmla z23.h, p3/M, z2.h, z20.h\n"
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z21.h\n"
+ "ld1h { z11.h }, p2/Z, [x15]\n"
+ "fmla z28.h, p3/M, z1.h, z20.h\n"
+ "fmla z29.h, p3/M, z0.h, z20.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, x17, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z16.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z9.h }, p1/Z, [x15, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "fmla z17.h, p3/M, z0.h, z11.h\n"
+ "fmla z19.h, p3/M, z2.h, z20.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
"ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
"whilelt p2.h, x21, %x[n_channels]\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z20.h\n"
+ "fmla z16.h, p3/M, z5.h, z20.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
"addvl x13, x13, #1\n"
- "cmp x26, %x[n_channels]\n"
- "ld1h { z11.h }, p1/Z, [x5, x15, LSL #1]\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "cmp x22, %x[n_channels]\n"
+ "ld1h { z11.h }, p1/Z, [x5, x17, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z13.h\n"
+ "fmla z18.h, p3/M, z7.h, z13.h\n"
"ld1h { z12.h }, p1/Z, [x13]\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "fmla z19.h, p3/M, z6.h, z13.h\n"
".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
- "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+ ".inst 0xc16ec9fc // fclamp { z28.h-z31.h }, z15.h, z14.h\n"
+ "ld1h { z13.h }, p1/Z, [x16, x7, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
"ld1h { z8.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
- "st1h { z26.h }, p0, [x23]\n"
- "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
- "st1h { z23.h }, p0, [x25]\n"
- "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
- "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
- "addvl x25, x25, #1\n"
- "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ ".inst 0xc16ec9f0 // fclamp { z16.h-z19.h }, z15.h, z14.h\n"
+ "st1h { z30.h }, p0, [x24]\n"
+ "st1h { z23.h }, p0, [x26]\n"
+ "st1h { z28.h }, p0, [x26, x27, LSL #1]\n"
+ "st1h { z29.h }, p0, [x26, x25, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "st1h { z31.h }, p0, [x24, x27, LSL #1]\n"
+ "st1h { z16.h }, p0, [x24, x25, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z17.h }, p0, [x23]\n"
+ "st1h { z18.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z19.h }, p0, [x23, x25, LSL #1]\n"
"addvl x23, x23, #1\n"
- "st1h { z29.h }, p0, [x22]\n"
- "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
- "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z20, z25\n fmla z20.h, p3/M, z7.h, z9.h\n"
+ "movprfx z24, z25\n fmla z24.h, p3/M, z8.h, z9.h\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"mov p0.b, p2.b\n"
- "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z21, z25\n fmla z21.h, p3/M, z6.h, z9.h\n"
+ "movprfx z22, z25\n fmla z22.h, p3/M, z5.h, z9.h\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z23, z25\n fmla z23.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z25\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z29, z25\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z25\n fmla z31.h, p3/M, z0.h, z9.h\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"add x3, x3, #0x1\n"
- "fmla z24.h, p3/M, z4.h, z13.h\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z13.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, x8, LSL #1]\n"
"add x20, x2, #0x1\n"
- "fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "cmp x3, x9\n"
- "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z21.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z13.h\n"
+ "cmp x3, x22\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
"fmla z28.h, p3/M, z0.h, z13.h\n"
"csel x2, x2, x20, LT\n"
"csel x3, x3, XZR, LT\n"
"fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "movprfx z30, z25\n fmla z30.h, p3/M, z1.h, z9.h\n"
"cmp x2, x21\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z25.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x8]\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x14]\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x16]\n"
- "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z23.h, p3/M, z23.h, z17.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
- "st1h { z26.h }, p0, [x23]\n"
- "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
- "st1h { z23.h }, p0, [x25]\n"
- "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
- "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
- "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
- "st1h { z29.h }, p0, [x22]\n"
- "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
- "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "fmla z20.h, p3/M, z6.h, z18.h\n"
+ "fmla z24.h, p3/M, z5.h, z13.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z17.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z18.h\n"
+ "fmla z31.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x5, x8, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z18.h\n"
+ "fmla z30.h, p3/M, z0.h, z18.h\n"
+ "fmla z29.h, p3/M, z1.h, z18.h\n"
+ "fmla z20.h, p3/M, z0.h, z17.h\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x16]\n"
+ "fmla z21.h, p3/M, z1.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z19.h\n"
+ "fmla z31.h, p3/M, z1.h, z19.h\n"
+ "fmla z23.h, p3/M, z5.h, z19.h\n"
+ "fmla z30.h, p3/M, z2.h, z19.h\n"
+ "fmla z22.h, p3/M, z0.h, z18.h\n"
+ "fmla z20.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x14]\n"
+ "fmla z21.h, p3/M, z7.h, z19.h\n"
+ "fmla z28.h, p3/M, z2.h, z17.h\n"
+ "fmla z20.h, p3/M, z8.h, z19.h\n"
+ "fmla z29.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z19.h\n"
+ "fmla z31.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z19.h\n"
+ "fmla z28.h, p3/M, z6.h, z19.h\n"
+ "fmla z29.h, p3/M, z5.h, z19.h\n"
+ "fmla z22.h, p3/M, z8.h, z19.h\n"
+ "fmla z20.h, p3/M, z3.h, z17.h\n"
+ "fmla z31.h, p3/M, z5.h, z18.h\n"
+ "fmla z30.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z8.h, z18.h\n"
+ "fmla z23.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x16, x8, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z17.h\n"
+ "fmla z22.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z20.h, p3/M, z5.h, z18.h\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z30.h, p3/M, z8.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x14, x8, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z18.h\n"
+ "fmla z28.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z17.h\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z30.h, p3/M, z3.h, z17.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z31.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x15]\n"
+ "fmla z20.h, p3/M, z1.h, z16.h\n"
+ "fmla z21.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x15, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z19.h\n"
+ "fmla z30.h, p3/M, z5.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmla z29.h, p3/M, z0.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "fmla z28.h, p3/M, z5.h, z17.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmla z29.h, p3/M, z8.h, z16.h\n"
+ "fmla z30.h, p3/M, z7.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z16.h\n"
+ ".inst 0xc16ec9f4 // fclamp { z20.h-z23.h }, z15.h, z14.h\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ ".inst 0xc16ec9fc // fclamp { z28.h-z31.h }, z15.h, z14.h\n"
+ "st1h { z22.h }, p0, [x24]\n"
+ "st1h { z24.h }, p0, [x26]\n"
+ "st1h { z20.h }, p0, [x26, x27, LSL #1]\n"
+ "st1h { z21.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z23.h }, p0, [x24, x27, LSL #1]\n"
+ "st1h { z28.h }, p0, [x24, x25, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23]\n"
+ "st1h { z30.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x23, x25, LSL #1]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
@@ -480,4 +480,4 @@ void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index e85cb9e017..acf66316ea 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -93,344 +93,344 @@ void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"mov x15, #0x0\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ldp x14, x13, [x16, #0x0]\n"
- "ldp x12, x11, [x16, #0x10]\n"
- "cnth x10\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "cnth x14\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1h { z17.h }, p3/Z, [x17]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1h { z30.h }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "ldr x9, [x16, #0x20]\n"
- "cmp x10, %x[n_channels]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "cmp x14, %x[n_channels]\n"
".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "sub x27, XZR, x10\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "sub x12, XZR, x14\n"
".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z8.h }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "inch x27\n"
- "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "ldr x25, [x16, #0x38]\n"
+ "movprfx z31, z30\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z30\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x16, #0x30]\n"
+ "inch x12\n"
+ "movprfx z25, z30\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x27, [x16, #0x38]\n"
"mov p1.b, p2.b\n"
- "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x24, [x16, #0x28]\n"
- "whilelt p0.h, x10, %x[n_channels]\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ldr x13, [x16, #0x48]\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z20, z30\n fmla z20.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "whilelt p0.h, x14, %x[n_channels]\n"
+ "movprfx z21, z30\n fmla z21.h, p3/M, z2.h, z9.h\n"
+ "movprfx z23, z30\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x48]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
"fmla z24.h, p3/M, z4.h, z13.h\n"
- "ldr x14, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x23, x15, LSL #1]\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "ldr x12, [x16, #0x50]\n"
+ "ldr x26, [x16, #0x50]\n"
"fmla z27.h, p3/M, z1.h, z13.h\n"
- "fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x11, [x16, #0x58]\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "ldr x9, [x16, #0x60]\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ldr x24, [x16, #0x68]\n"
- "ld1h { z17.h }, p3/Z, [x17]\n"
+ "fmla z20.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x58]\n"
+ "fmla z21.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "movprfx z22, z30\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "ldr x23, [x16, #0x68]\n"
+ "ld1h { z30.h }, p3/Z, [x17]\n"
"fmla z25.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "ldr x25, [x16, #0x78]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ldr x14, [x16, #0x80]\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ldr x22, [x16, #0x70]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z17.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z20.h, p3/M, z4.h, z19.h\n"
+ "ldr x20, [x16, #0x80]\n"
"addvl x17, x17, #1\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "ldr x13, [x16, #0x88]\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0x90]\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ldr x23, [x28, #0x0]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "ldr x9, [x16, #0xa0]\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "ldr x11, [x16, #0x98]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "ldr x21, [x28, #0x10]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "ldr x14, [x16, #0xc0]\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ldr x9, [x16, #0x20]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldp x14, x13, [x16, #0x0]\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z23.h, p3/M, z23.h, z18.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ldp x12, x11, [x16, #0x10]\n"
+ "fmla z31.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z0.h, z18.h\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z21.h, p3/M, z1.h, z17.h\n"
+ "fmla z25.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z19.h\n"
+ "fmla z23.h, p3/M, z1.h, z19.h\n"
+ "ldr x9, [x13, #0x0]\n"
+ "fmla z22.h, p3/M, z2.h, z19.h\n"
+ "ldr x28, [x13, #0x8]\n"
+ "fmla z31.h, p3/M, z1.h, z18.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z9.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z17.h\n"
+ "fmla z25.h, p3/M, z7.h, z19.h\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "ldr x26, [x13, #0x10]\n"
+ "fmla z20.h, p3/M, z2.h, z9.h\n"
+ "ldr x25, [x13, #0x18]\n"
+ "fmla z24.h, p3/M, z8.h, z19.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z31.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z25.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z29.h\n"
+ "fmla z20.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z22.h, p3/M, z4.h, z29.h\n"
+ "fmla z21.h, p3/M, z5.h, z29.h\n"
+ "fmla z23.h, p3/M, z3.h, z29.h\n"
+ "fmla z26.h, p3/M, z8.h, z29.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "fmla z31.h, p3/M, z4.h, z17.h\n"
+ "fmla z20.h, p3/M, z8.h, z18.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z6.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z13.h }, p2/Z, [x10, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z16.h\n"
+ "fmla z25.h, p3/M, z4.h, z16.h\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z20.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z28.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x20]\n"
+ "fmla z22.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z17.h\n"
+ "fmla z23.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z28.h\n"
+ "fmla z24.h, p3/M, z1.h, z28.h\n"
+ "fmla z27.h, p3/M, z6.h, z17.h\n"
+ "fmla z25.h, p3/M, z0.h, z28.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmla z23.h, p3/M, z4.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z0.h, z17.h\n"
+ "fmla z22.h, p3/M, z5.h, z16.h\n"
+ "fmla z27.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldp x23, x22, [x16, #0x0]\n"
+ "fmla z23.h, p3/M, z2.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "ldp x21, x20, [x16, #0x10]\n"
"inch x15\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z20.h, p3/M, z5.h, z18.h\n"
".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z9.h }, p0/Z, [x23, x14, LSL #1]\n"
"whilelt p2.h, x15, %x[n_channels]\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
- "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
- "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
- "ldr x22, [x28, #0x28]\n"
- "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
- "ldr x21, [x28, #0x30]\n"
- "ld1h { z13.h }, p0/Z, [x9, x10, LSL #1]\n"
- "inch x10\n"
- "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
- "ldr x23, [x28, #0x20]\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z10.h }, p0/Z, [x22, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z11.h }, p0/Z, [x21, x14, LSL #1]\n"
+ ".inst 0xc16ec9f8 // fclamp { z24.h-z27.h }, z15.h, z14.h\n"
+ "ld1h { z12.h }, p0/Z, [x20, x14, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "ld1h { z13.h }, p0/Z, [x24, x14, LSL #1]\n"
+ "inch x14\n"
".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
"addvl x17, x17, #4\n"
- "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
- "ldr x20, [x28, #0x38]\n"
- "cmp x10, %x[n_channels]\n"
- ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "cmp x14, %x[n_channels]\n"
+ ".inst 0xc16ec9f4 // fclamp { z20.h-z23.h }, z15.h, z14.h\n"
"ld1h { z8.h }, p3/Z, [x17]\n"
"addvl x17, x17, #1\n"
- "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
- "ldr x23, [x28, #0x40]\n"
- "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
- "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
- "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
- "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ "st1h { z24.h }, p1, [x28, x12, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "st1h { z31.h }, p1, [x9, x12, LSL #1]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1h { z25.h }, p1, [x26, x12, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "st1h { z26.h }, p1, [x25, x12, LSL #1]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "st1h { z27.h }, p1, [x20, x12, LSL #1]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "st1h { z20.h }, p1, [x23, x12, LSL #1]\n"
+ "st1h { z21.h }, p1, [x22, x12, LSL #1]\n"
+ "st1h { z22.h }, p1, [x21, x12, LSL #1]\n"
+ "st1h { z23.h }, p1, [x20, x12, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
- "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "inch x27\n"
- "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
- "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "ldr x25, [x16, #0x38]\n"
- "mov p1.b, p2.b\n"
- "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
- "ldr x24, [x16, #0x28]\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ldr x13, [x16, #0x48]\n"
- "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "movprfx z20, z30\n fmla z20.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z30\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x23, [x16, #0x30]\n"
+ "inch x12\n"
+ "movprfx z25, z30\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x27, [x16, #0x38]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z30\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "movprfx z29, z30\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z30\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x48]\n"
+ "fmla z20.h, p3/M, z0.h, z10.h\n"
"fmla z24.h, p3/M, z4.h, z13.h\n"
- "ldr x14, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla z25.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x23, x15, LSL #1]\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "ldr x12, [x16, #0x50]\n"
+ "ldr x26, [x16, #0x50]\n"
"fmla z27.h, p3/M, z1.h, z13.h\n"
"fmla z28.h, p3/M, z0.h, z13.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x11, [x16, #0x58]\n"
+ "ld1h { z18.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x58]\n"
"fmla z29.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
- "ldr x9, [x16, #0x60]\n"
- "fmla z23.h, p3/M, z5.h, z13.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ldr x24, [x16, #0x68]\n"
+ "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "fmla z20.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ldr x23, [x16, #0x68]\n"
"fmla z25.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z4.h, z11.h\n"
- "ldr x26, [x16, #0x70]\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
- "ldr x25, [x16, #0x78]\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z28.h, p3/M, z4.h, z10.h\n"
- "ldr x14, [x16, #0x80]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "ldr x13, [x16, #0x88]\n"
- "fmla z29.h, p3/M, z1.h, z11.h\n"
- "fmla z25.h, p3/M, z1.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0x90]\n"
- "fmla z27.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ldr x23, [x28, #0x0]\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla z23.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z0.h, z11.h\n"
- "ldr x9, [x16, #0xa0]\n"
- "fmla z25.h, p3/M, z7.h, z10.h\n"
- "ldr x11, [x16, #0x98]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "ldr x21, [x28, #0x10]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
- "fmla z24.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldr x24, [x16, #0xa8]\n"
- "fmla z25.h, p3/M, z5.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmla z29.h, p3/M, z5.h, z10.h\n"
- "ldr x14, [x16, #0xc0]\n"
- "fmla z31.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "ldr x20, [x28, #0x18]\n"
- "fmla z24.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z27.h, p3/M, z0.h, z12.h\n"
- "fmla z30.h, p3/M, z6.h, z13.h\n"
- "fmla z29.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "fmla z25.h, p3/M, z4.h, z11.h\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z29.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "fmla z29.h, p3/M, z0.h, z12.h\n"
- "fmla z27.h, p3/M, z8.h, z13.h\n"
- "fmla z26.h, p3/M, z3.h, z12.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z30.h, p3/M, z5.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmax z23.h, p3/M, z23.h, z18.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z13.h\n"
- ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "fmla z31.h, p3/M, z6.h, z13.h\n"
- "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
- "ldr x22, [x28, #0x28]\n"
- "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
- "ldr x21, [x28, #0x30]\n"
- "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
- "ldr x20, [x28, #0x38]\n"
- "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
- "ldr x23, [x28, #0x20]\n"
- ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
- "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
- "ldr x23, [x28, #0x40]\n"
- "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
- "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
- "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
- "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "ldr x22, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z19.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z19.h\n"
+ "fmla z28.h, p3/M, z4.h, z18.h\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla z20.h, p3/M, z7.h, z19.h\n"
+ "fmla z24.h, p3/M, z0.h, z17.h\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z19.h\n"
+ "fmla z25.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z18.h\n"
+ "fmla z31.h, p3/M, z1.h, z18.h\n"
+ "ldr x9, [x13, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z18.h\n"
+ "ldr x28, [x13, #0x8]\n"
+ "fmla z20.h, p3/M, z1.h, z17.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "ldr x25, [x13, #0x10]\n"
+ "fmla z28.h, p3/M, z2.h, z17.h\n"
+ "ldr x24, [x13, #0x18]\n"
+ "fmla z24.h, p3/M, z8.h, z18.h\n"
+ "fmla z29.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z20.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z25.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z18.h\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z30.h, p3/M, z4.h, z18.h\n"
+ "fmla z29.h, p3/M, z5.h, z18.h\n"
+ "fmla z31.h, p3/M, z3.h, z18.h\n"
+ "fmla z26.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "fmla z30.h, p3/M, z6.h, z16.h\n"
+ "fmla z29.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z16.h\n"
+ "fmla z25.h, p3/M, z4.h, z16.h\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z18.h\n"
+ "fmla z26.h, p3/M, z7.h, z19.h\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z31.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z27.h, p3/M, z6.h, z19.h\n"
+ "fmla z25.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "fmla z20.h, p3/M, z6.h, z17.h\n"
+ "fmla z29.h, p3/M, z0.h, z17.h\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z27.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z18.h\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z28.h, p3/M, z5.h, z18.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmla z29.h, p3/M, z8.h, z16.h\n"
+ "fmla z30.h, p3/M, z7.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z16.h\n"
+ ".inst 0xc16ec9f8 // fclamp { z24.h-z27.h }, z15.h, z14.h\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ ".inst 0xc16ec9fc // fclamp { z28.h-z31.h }, z15.h, z14.h\n"
+ "st1h { z24.h }, p0, [x28, x12, LSL #1]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "st1h { z20.h }, p0, [x9, x12, LSL #1]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1h { z25.h }, p0, [x25, x12, LSL #1]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "st1h { z26.h }, p0, [x24, x12, LSL #1]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "st1h { z27.h }, p0, [x20, x12, LSL #1]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "st1h { z28.h }, p0, [x23, x12, LSL #1]\n"
+ "st1h { z29.h }, p0, [x22, x12, LSL #1]\n"
+ "st1h { z30.h }, p0, [x21, x12, LSL #1]\n"
+ "st1h { z31.h }, p0, [x20, x12, LSL #1]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
@@ -441,4 +441,4 @@ void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 6b75d12295..f06fb72d31 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
namespace arm_conv {
namespace depthwise {
@@ -65,3 +67,5 @@ class sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 37a9febf47..0d1151ae6f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -88,98 +88,98 @@ void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "mov x1, #0x0\n"
"mov x2, #0x0\n"
+ "mov x3, #0x0\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
"1:" // Tile loop
- "str x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x22, #0x4\n"
- "str x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x3, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "mul x20, x1, x21\n" // offset = tile_i * ld_input_row
- "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x2, x3, x20\n" // offset += tile_j * ld_input_col
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "add x7, x4, x4\n"
"mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "add x6, x3, x3\n"
- "add x4, x4, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x7, x4, x21, LSL #1\n"
- "add x8, x6, x3\n"
- "add x17, x7, x21, LSL #1\n"
- "add x16, x8, x3\n"
- "add x15, x17, x21, LSL #1\n"
- "add x14, x16, x3\n"
- "add x13, x15, x21, LSL #1\n"
+ "add x8, x7, x4\n"
+ "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #1\n"
+ "add x15, x17, x4\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
"add x12, x13, x21, LSL #1\n"
- "cbnz x2, 2f\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x11, x12, x21, LSL #1\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"lsl x10, %x[n_channels], #0x1\n"
"mov x21, #0x8\n"
- "mul x21, x21, x3\n"
- "add x9, x17, x6, LSL #1\n"
- "add x28, x4, x14, LSL #1\n"
- "add x27, x17, x8, LSL #1\n"
- "sub x20, x11, x2\n"
- "add x26, x12, x14, LSL #1\n"
+ "mul x21, x21, x4\n"
+ "add x9, x14, x7, LSL #1\n"
+ "add x28, x5, x15, LSL #1\n"
+ "add x27, x14, x8, LSL #1\n"
+ "sub x20, x20, x3\n"
+ "add x26, x11, x15, LSL #1\n"
"sub x20, x20, #0x1\n"
- "add x25, x15, x6, LSL #1\n"
+ "add x25, x13, x7, LSL #1\n"
"and x20, x20, #0x3fffff\n"
- "add x24, x4, x3, LSL #1\n"
+ "add x24, x5, x4, LSL #1\n"
"orr x10, x10, x20, LSL #22\n"
- "add x23, x4, x16, LSL #1\n"
+ "add x23, x5, x17, LSL #1\n"
"orr x10, x10, x21, LSL #38\n"
- "add x22, x15, x8, LSL #1\n"
- "add x21, x7, x14, LSL #1\n"
- "add x20, x7, x6, LSL #1\n"
+ "add x22, x13, x8, LSL #1\n"
+ "add x21, x16, x15, LSL #1\n"
+ "add x20, x16, x7, LSL #1\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- "add x9, x13, x14, LSL #1\n"
- ".inst 0xf8aa489a // rprfm pldonce, x10, [x4]\n"
+ "add x9, x12, x15, LSL #1\n"
+ ".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
- "add x28, x7, x8, LSL #1\n"
+ "add x28, x16, x8, LSL #1\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
- "add x27, x12, x3, LSL #1\n"
- ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ "add x27, x11, x4, LSL #1\n"
+ ".inst 0xf8aa497a // rprfm pldonce, x10, [x11]\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
- "add x26, x17, x3, LSL #1\n"
+ "add x26, x14, x4, LSL #1\n"
".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
- "add x25, x12, x16, LSL #1\n"
+ "add x25, x11, x17, LSL #1\n"
".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
- "add x24, x17, x16, LSL #1\n"
+ "add x24, x14, x17, LSL #1\n"
".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
- "add x23, x4, x6, LSL #1\n"
+ "add x23, x5, x7, LSL #1\n"
".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
- "add x22, x15, x3, LSL #1\n"
- ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ "add x22, x13, x4, LSL #1\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
- "add x21, x4, x8, LSL #1\n"
- ".inst 0xf8aa49ba // rprfm pldonce, x10, [x13]\n"
+ "add x21, x5, x8, LSL #1\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
- "add x20, x15, x16, LSL #1\n"
+ "add x20, x13, x17, LSL #1\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- "add x9, x17, x14, LSL #1\n"
+ "add x9, x14, x15, LSL #1\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
- "add x28, x13, x6, LSL #1\n"
+ "add x28, x12, x7, LSL #1\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
- "add x27, x15, x14, LSL #1\n"
+ "add x27, x13, x15, LSL #1\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
- "add x26, x12, x6, LSL #1\n"
+ "add x26, x11, x7, LSL #1\n"
".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
- "add x25, x13, x8, LSL #1\n"
+ "add x25, x12, x8, LSL #1\n"
".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
- "add x24, x12, x8, LSL #1\n"
+ "add x24, x11, x8, LSL #1\n"
".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
- "add x23, x7, x3, LSL #1\n"
+ "add x23, x16, x4, LSL #1\n"
".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
- "add x22, x7, x16, LSL #1\n"
+ "add x22, x16, x17, LSL #1\n"
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
- "add x21, x13, x3, LSL #1\n"
- ".inst 0xf8aa4a3a // rprfm pldonce, x10, [x17]\n"
+ "add x21, x12, x4, LSL #1\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
- "add x20, x13, x16, LSL #1\n"
+ "add x20, x12, x17, LSL #1\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- ".inst 0xf8aa49fa // rprfm pldonce, x10, [x15]\n"
+ ".inst 0xf8aa49ba // rprfm pldonce, x10, [x13]\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
@@ -190,483 +190,483 @@ void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mov x21, #0x4\n"
- "ld1h { z15.h }, p3/Z, [x5]\n"
- "addvl x5, x5, #1\n"
+ "ld1h { z14.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
"ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cnth x28\n"
- ".inst 0xa040a0a0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
- "addvl x5, x5, #4\n"
- "ldr x27, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cnth x22\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- ".inst 0xa040a0a4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
- "addvl x5, x5, #4\n"
- "mul x20, x1, x22\n" // offset = tile_i * ld_output_row
- "cmp x28, %x[n_channels]\n"
- "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "madd x20, x2, x9, x20\n" // offset += tile_j * ld_output_col
- "add x26, x9, x9\n"
- "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x20, x2, x23\n" // offset = tile_i * ld_output_row
+ "cmp x22, %x[n_channels]\n"
+ "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x3, x9, x20\n" // offset += tile_j * ld_output_col
+ "add x27, x9, x9\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mul x20, x20, x21\n" // offset *= output_tile_size
- "add x25, x26, x9\n"
- "ld1h { z8.h }, p3/Z, [x5]\n"
- "add x27, x27, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x26, x27, x9\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"mov x21, #0x0\n"
- "ld1h { z9.h }, p2/Z, [x17, x6, LSL #1]\n"
- "add x24, x27, x22, LSL #1\n"
- "sub x20, XZR, x28\n"
- "ld1h { z10.h }, p2/Z, [x4]\n"
- "add x23, x24, x22, LSL #1\n"
- "ld1h { z11.h }, p2/Z, [x4, x14, LSL #1]\n"
- "addvl x5, x5, #1\n"
- "add x22, x23, x22, LSL #1\n"
- "ld1h { z12.h }, p2/Z, [x17, x8, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "add x25, x28, x23, LSL #1\n"
+ "sub x20, XZR, x22\n"
+ "ld1h { z10.h }, p2/Z, [x5]\n"
+ "add x24, x25, x23, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "addvl x6, x6, #1\n"
+ "add x23, x24, x23, LSL #1\n"
+ "ld1h { z12.h }, p2/Z, [x14, x8, LSL #1]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x28, %x[n_channels]\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x22, %x[n_channels]\n"
"inch x21\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "inch x28\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z3.h, z9.h\n"
+ "movprfx z17, z14\n fmla z17.h, p3/M, z1.h, z9.h\n"
+ "inch x22\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "movprfx z18, z14\n fmla z18.h, p3/M, z0.h, z9.h\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
"inch x20\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
- "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ld1h { z15.h }, p3/Z, [x5]\n"
- "addvl x5, x5, #1\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "ld1h { z9.h }, p2/Z, [x7]\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
- "addvl x4, x4, #1\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x17]\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "movprfx z30, z14\n fmla z30.h, p3/M, z6.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "movprfx z16, z14\n fmla z16.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "movprfx z31, z14\n fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "fmla z26.h, p3/M, z4.h, z12.h\n"
"fmla z17.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x11, x15, LSL #1]\n"
"fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
- "addvl x17, x17, #1\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x15]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z6.h, z19.h\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z31.h, p3/M, z6.h, z12.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z3.h, z12.h\n"
+ "movprfx z19, z14\n fmla z19.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z8.h, z22.h\n"
+ "fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z9.h\n"
+ "fmla z18.h, p3/M, z3.h, z9.h\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z14.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z24.h, p3/M, z8.h, z9.h\n"
+ "fmla z16.h, p3/M, z5.h, z9.h\n"
+ "fmla z20.h, p3/M, z2.h, z9.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12]\n"
+ "fmla z26.h, p3/M, z7.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z11.h\n"
+ "fmla z17.h, p3/M, z5.h, z11.h\n"
+ "fmla z18.h, p3/M, z4.h, z11.h\n"
+ "fmla z19.h, p3/M, z3.h, z11.h\n"
+ "fmla z21.h, p3/M, z2.h, z11.h\n"
+ "fmla z22.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z0.h, z9.h\n"
+ "fmla z16.h, p3/M, z6.h, z12.h\n"
+ "fmla z20.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z9.h\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
"fmla z29.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x8, LSL #1]\n"
"fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "ld1h { z9.h }, p1/Z, [x17, x6, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
"fmla z19.h, p3/M, z8.h, z12.h\n"
"fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
- "addvl x15, x15, #1\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
- "addvl x12, x12, #1\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z10.h\n"
"fmla z28.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
- "addvl x7, x7, #1\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z20.h, p3/M, z7.h, z12.h\n"
+ "fmla z21.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z7.h, z9.h\n"
+ "fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z5.h, z10.h\n"
+ "fmla z27.h, p3/M, z4.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z19.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x5, x8, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z25.h, p3/M, z6.h, z11.h\n"
+ "fmla z16.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z3.h, z11.h\n"
+ "fmla z20.h, p3/M, z1.h, z11.h\n"
+ "fmla z21.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x14]\n"
+ "fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z22.h, p3/M, z2.h, z11.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z7.h, z11.h\n"
"fmla z18.h, p3/M, z5.h, z11.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
"fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
"fmla z23.h, p3/M, z1.h, z11.h\n"
- "cmp x28, %x[n_channels]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13]\n"
+ "fmla z21.h, p3/M, z4.h, z11.h\n"
+ "fmla z22.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "fmla z19.h, p3/M, z2.h, z12.h\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
"addvl x13, x13, #1\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "ld1h { z11.h }, p1/Z, [x4, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- ".inst 0xa040a0a0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
- "addvl x5, x5, #4\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "ld1h { z12.h }, p1/Z, [x17, x8, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- ".inst 0xa040a0a4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
- "addvl x5, x5, #4\n"
- ".inst 0xc16dc9d0 // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
- ".inst 0xc16dc9d4 // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
- "ld1h { z10.h }, p1/Z, [x4]\n"
- "ld1h { z8.h }, p3/Z, [x5]\n"
- "addvl x5, x5, #1\n"
- ".inst 0xc16dc9d8 // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
- ".inst 0xc16dc9dc // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
- "st1h { z16.h }, p0, [x27]\n"
- "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
- "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
- "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
- "addvl x27, x27, #1\n"
- "st1h { z20.h }, p0, [x24]\n"
- "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
- "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
- "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+ "fmla z16.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z11.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z12.h\n"
+ "fmla z21.h, p3/M, z7.h, z10.h\n"
+ "fmla z22.h, p3/M, z6.h, z10.h\n"
+ "fmla z16.h, p3/M, z8.h, z11.h\n"
+ "fmla z20.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x8, LSL #1]\n"
+ "fmla z19.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x8, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z21.h, p3/M, z5.h, z11.h\n"
+ "fmla z22.h, p3/M, z4.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z20.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z11.h\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z19.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z22.h, p3/M, z7.h, z12.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z11.h\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z31.h, p3/M, z4.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z11.h\n"
+ "cmp x22, %x[n_channels]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "fmla z16.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z11.h }, p1/Z, [x5, x15, LSL #1]\n"
+ "fmla z17.h, p3/M, z6.h, z12.h\n"
+ "fmla z20.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z3.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z10.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmla z19.h, p3/M, z7.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z12.h }, p1/Z, [x14, x8, LSL #1]\n"
+ "fmla z23.h, p3/M, z4.h, z10.h\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ ".inst 0xc16fc9bc // fclamp { z28.h-z31.h }, z13.h, z15.h\n"
+ ".inst 0xc16fc9b8 // fclamp { z24.h-z27.h }, z13.h, z15.h\n"
+ "ld1h { z10.h }, p1/Z, [x5]\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ ".inst 0xc16fc9b0 // fclamp { z16.h-z19.h }, z13.h, z15.h\n"
+ ".inst 0xc16fc9b4 // fclamp { z20.h-z23.h }, z13.h, z15.h\n"
+ "st1h { z28.h }, p0, [x28]\n"
+ "st1h { z29.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x28, x26, LSL #1]\n"
+ "addvl x28, x28, #1\n"
+ "st1h { z24.h }, p0, [x25]\n"
+ "st1h { z25.h }, p0, [x25, x9, LSL #1]\n"
+ "st1h { z26.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z27.h }, p0, [x25, x26, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z16.h }, p0, [x24]\n"
+ "st1h { z17.h }, p0, [x24, x9, LSL #1]\n"
+ "st1h { z18.h }, p0, [x24, x27, LSL #1]\n"
+ "st1h { z19.h }, p0, [x24, x26, LSL #1]\n"
"addvl x24, x24, #1\n"
- "st1h { z24.h }, p0, [x23]\n"
- "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
- "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
- "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z20.h }, p0, [x23]\n"
+ "st1h { z21.h }, p0, [x23, x9, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x26, LSL #1]\n"
"addvl x23, x23, #1\n"
- "st1h { z28.h }, p0, [x22]\n"
- "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
- "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
- "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z14\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"mov p0.b, p2.b\n"
- "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "movprfx z25, z14\n fmla z25.h, p3/M, z7.h, z9.h\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z26, z14\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "add x2, x2, #0x1\n"
+ "add x3, x3, #0x1\n"
"fmla z21.h, p3/M, z5.h, z12.h\n"
- "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
- "add x20, x1, #0x1\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "cmp x2, x11\n"
+ "movprfx z28, z14\n fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "add x20, x2, #0x1\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z14\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "cmp x3, x22\n"
"fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
- "csel x1, x1, x20, LT\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "csel x2, x2, XZR, LT\n"
- "cmp x1, x21\n"
- "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "csel x2, x2, x20, LT\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z12.h\n"
+ "csel x3, x3, XZR, LT\n"
+ "cmp x2, x21\n"
+ "movprfx z16, z14\n fmla z16.h, p3/M, z6.h, z17.h\n"
"fmla z21.h, p3/M, z7.h, z9.h\n"
- "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
- "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z31, z14\n fmla z31.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "movprfx z19, z14\n fmla z19.h, p3/M, z8.h, z18.h\n"
"fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z17, z14\n fmla z17.h, p3/M, z1.h, z9.h\n"
+ "movprfx z18, z14\n fmla z18.h, p3/M, z0.h, z9.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "fmla z16.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z14.h }, p2/Z, [x16]\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x12]\n"
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z19.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "fmla z20.h, p3/M, z0.h, z14.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "fmla z16.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z10.h\n"
"fmla z25.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x8, LSL #1]\n"
"fmla z26.h, p3/M, z3.h, z9.h\n"
- "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "ld1h { z9.h }, p2/Z, [x7]\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z17.h, p3/M, z4.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x4, LSL #1]\n"
+ "fmla z20.h, p3/M, z2.h, z9.h\n"
"fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z12.h\n"
+ "fmla z27.h, p3/M, z3.h, z12.h\n"
"fmla z22.h, p3/M, z1.h, z12.h\n"
"fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n"
"fmla z16.h, p3/M, z7.h, z10.h\n"
"fmla z17.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z9.h\n"
+ "fmla z21.h, p3/M, z3.h, z9.h\n"
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z7.h, z9.h\n"
+ "fmla z25.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z8.h, z11.h\n"
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z14.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z12.h\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
"fmla z22.h, p3/M, z5.h, z12.h\n"
"fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x17]\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x5, x8, LSL #1]\n"
+ "fmla z20.h, p3/M, z7.h, z14.h\n"
+ "fmla z21.h, p3/M, z6.h, z14.h\n"
+ "fmla z28.h, p3/M, z4.h, z14.h\n"
+ "fmla z29.h, p3/M, z3.h, z14.h\n"
+ "fmla z16.h, p3/M, z1.h, z14.h\n"
+ "fmla z17.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "fmla z25.h, p3/M, z1.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14]\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z2.h, z14.h\n"
+ "fmla z22.h, p3/M, z8.h, z14.h\n"
+ "fmla z23.h, p3/M, z7.h, z14.h\n"
+ "fmla z30.h, p3/M, z5.h, z14.h\n"
"fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x15]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmla z31.h, p3/M, z4.h, z14.h\n"
+ "fmla z19.h, p3/M, z1.h, z14.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z2.h, z9.h\n"
+ "fmla z26.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z14.h }, p2/Z, [x13]\n"
+ "fmla z17.h, p3/M, z4.h, z11.h\n"
+ "fmla z18.h, p3/M, z3.h, z11.h\n"
+ "fmla z27.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
"fmla z31.h, p3/M, z2.h, z12.h\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "fmla z29.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "fmla z20.h, p3/M, z6.h, z14.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z14.h\n"
+ "fmla z16.h, p3/M, z0.h, z14.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z11.h\n"
+ "fmla z19.h, p3/M, z2.h, z9.h\n"
+ "fmla z23.h, p3/M, z8.h, z9.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z18.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z14.h }, p2/Z, [x11, x8, LSL #1]\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z10.h\n"
+ "fmla z19.h, p3/M, z3.h, z10.h\n"
+ "fmla z16.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z14.h\n"
+ "fmla z18.h, p3/M, z7.h, z14.h\n"
+ "fmla z19.h, p3/M, z6.h, z14.h\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x4, LSL #1]\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "fmla z20.h, p3/M, z1.h, z9.h\n"
+ "fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
"fmla z22.h, p3/M, z2.h, z11.h\n"
"fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- ".inst 0xc16dc9d0 // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
- ".inst 0xc16dc9d4 // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
- ".inst 0xc16dc9d8 // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
- ".inst 0xc16dc9dc // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
- "st1h { z16.h }, p0, [x27]\n"
- "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
- "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
- "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
- "st1h { z20.h }, p0, [x24]\n"
- "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
- "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
- "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
- "st1h { z24.h }, p0, [x23]\n"
- "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
- "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
- "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z28.h }, p0, [x22]\n"
- "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
- "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
- "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z10.h\n"
+ "fmla z29.h, p3/M, z6.h, z10.h\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmla z18.h, p3/M, z5.h, z12.h\n"
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ ".inst 0xc16fc9b8 // fclamp { z24.h-z27.h }, z13.h, z15.h\n"
+ ".inst 0xc16fc9b4 // fclamp { z20.h-z23.h }, z13.h, z15.h\n"
+ ".inst 0xc16fc9bc // fclamp { z28.h-z31.h }, z13.h, z15.h\n"
+ ".inst 0xc16fc9b0 // fclamp { z16.h-z19.h }, z13.h, z15.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "st1h { z25.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z26.h }, p0, [x28, x27, LSL #1]\n"
+ "st1h { z27.h }, p0, [x28, x26, LSL #1]\n"
+ "st1h { z20.h }, p0, [x25]\n"
+ "st1h { z21.h }, p0, [x25, x9, LSL #1]\n"
+ "st1h { z22.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25, x26, LSL #1]\n"
+ "st1h { z28.h }, p0, [x24]\n"
+ "st1h { z29.h }, p0, [x24, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x24, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x24, x26, LSL #1]\n"
+ "st1h { z16.h }, p0, [x23]\n"
+ "st1h { z17.h }, p0, [x23, x9, LSL #1]\n"
+ "st1h { z18.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z19.h }, p0, [x23, x26, LSL #1]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 2e6f1123a4..7430ff89ed 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -98,556 +98,556 @@ void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "mov x15, #0x0\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x16, #0x0\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ldp x14, x13, [x16, #0x0]\n"
- "ldp x12, x11, [x16, #0x10]\n"
- "cnth x10\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "cnth x15\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1h { z14.h }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "cmp x10, %x[n_channels]\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "sub x28, XZR, x10\n"
- ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z8.h }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1h { z13.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "cmp x15, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "sub x13, XZR, x15\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x27, [x16, #0x20]\n"
- "inch x28\n"
- "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
+ "movprfx z25, z13\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z16, z13\n fmla z16.h, p3/M, z8.h, z9.h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "inch x13\n"
+ "movprfx z26, z13\n fmla z26.h, p3/M, z3.h, z9.h\n"
+ "movprfx z29, z13\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [x17, #0x30]\n"
"mov p1.b, p2.b\n"
- "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "ldr x25, [x16, #0x28]\n"
- "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "whilelt p0.h, x10, %x[n_channels]\n"
- "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "ldr x24, [x16, #0x38]\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x14, [x16, #0x40]\n"
+ "movprfx z30, z13\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z17, z13\n fmla z17.h, p3/M, z7.h, z9.h\n"
+ "whilelt p0.h, x15, %x[n_channels]\n"
+ "movprfx z18, z13\n fmla z18.h, p3/M, z6.h, z9.h\n"
+ "movprfx z24, z13\n fmla z24.h, p3/M, z5.h, z9.h\n"
+ "ldr x23, [x17, #0x38]\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x22, [x17, #0x40]\n"
"fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x13, [x16, #0x48]\n"
- "fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x12, [x16, #0x50]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "movprfx z19, z13\n fmla z19.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z26.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
"fmla z17.h, p3/M, z8.h, z12.h\n"
- "ldr x27, [x16, #0x60]\n"
+ "ldr x26, [x17, #0x60]\n"
"fmla z18.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x25, [x16, #0x68]\n"
- "fmla z21.h, p3/M, z7.h, z9.h\n"
+ "movprfx z20, z13\n fmla z20.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
"fmla z19.h, p3/M, z6.h, z12.h\n"
- "ldr x11, [x16, #0x58]\n"
- "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldr x26, [x16, #0x70]\n"
- "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldr x24, [x16, #0x78]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x14, [x16, #0x80]\n"
- "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ldr x13, [x16, #0x88]\n"
- "ld1h { z14.h }, p3/Z, [x17]\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "ldr x23, [x9, #0x0]\n"
- "addvl x17, x17, #1\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0x90]\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "ldr x11, [x16, #0x98]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "ldr x22, [x9, #0x8]\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "ldr x21, [x9, #0x10]\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "ldr x20, [x9, #0x18]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0xa8]\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z27, z13\n fmla z27.h, p3/M, z3.h, z12.h\n"
+ "movprfx z31, z13\n fmla z31.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z23, z13\n fmla z23.h, p3/M, z8.h, z22.h\n"
+ "fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z21, z13\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "movprfx z22, z13\n fmla z22.h, p3/M, z0.h, z9.h\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ld1h { z13.h }, p3/Z, [x8]\n"
+ "fmla z24.h, p3/M, z8.h, z9.h\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z20.h, p3/M, z2.h, z9.h\n"
+ "fmla z16.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z17.h, p3/M, z0.h, z11.h\n"
+ "fmla z18.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x98]\n"
+ "fmla z25.h, p3/M, z8.h, z10.h\n"
+ "fmla z19.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z26.h, p3/M, z7.h, z10.h\n"
+ "fmla z27.h, p3/M, z6.h, z10.h\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z22.h, p3/M, z1.h, z10.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xa8]\n"
"fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z0.h, z9.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "fmla z20.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xb0]\n"
"fmla z17.h, p3/M, z4.h, z10.h\n"
"fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldr x14, [x16, #0xc0]\n"
+ "fmla z25.h, p3/M, z1.h, z10.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z23.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
"fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x13, [x16, #0xc8]\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "ldr x11, [x16, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0xd0]\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xf0]\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla z17.h, p3/M, z5.h, z11.h\n"
+ "fmla z18.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "fmla z19.h, p3/M, z3.h, z11.h\n"
+ "fmla z26.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z20.h, p3/M, z7.h, z9.h\n"
+ "fmla z21.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z16.h, p3/M, z7.h, z12.h\n"
+ "fmla z17.h, p3/M, z6.h, z12.h\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z3.h, z12.h\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z18.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z8.h, z9.h\n"
+ "fmla z23.h, p3/M, z7.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z11.h\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xf0]\n"
"fmla z16.h, p3/M, z2.h, z10.h\n"
"fmla z17.h, p3/M, z1.h, z10.h\n"
"fmla z18.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "ldr x14, [x16, #0x100]\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x13, [x16, #0x108]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0x110]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z21.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "ldr x22, [x17, #0x100]\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z19.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z16.h, p3/M, z6.h, z9.h\n"
+ "fmla z24.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z22.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z9.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x110]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z23.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x16, LSL #1]\n"
"fmla z19.h, p3/M, z8.h, z12.h\n"
- "ldr x11, [x16, #0x118]\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z23.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "ldr x21, [x17, #0x118]\n"
"fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z10.h\n"
+ "fmla z22.h, p3/M, z3.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x16, LSL #1]\n"
"fmla z29.h, p3/M, z7.h, z10.h\n"
"fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z5.h, z10.h\n"
"fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "ldp x14, x13, [x16, #0x0]\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z7.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z11.h\n"
+ "fmla z20.h, p3/M, z8.h, z11.h\n"
"fmla z29.h, p3/M, z8.h, z12.h\n"
- "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
"fmla z30.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x16, LSL #1]\n"
"fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldp x12, x11, [x16, #0x10]\n"
- "fmla z22.h, p3/M, z2.h, z11.h\n"
- "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z12.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldp x20, x22, [x17, #0x0]\n"
+ "fmla z16.h, p3/M, z4.h, z10.h\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z12.h\n"
+ "fmla z19.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z9.h }, p0/Z, [x20, x15, LSL #1]\n"
+ "fmla z21.h, p3/M, z8.h, z11.h\n"
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z8.h, z10.h\n"
+ "inch x16\n"
+ ".inst 0xc16fc9d0 // fclamp { z16.h-z19.h }, z14.h, z15.h\n"
+ "fmla z31.h, p3/M, z7.h, z10.h\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z7.h, z0.h\n"
+ "fmla z29.h, p3/M, z6.h, z0.h\n"
+ "ld1h { z11.h }, p0/Z, [x21, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z4.h, z0.h\n"
+ "fmla z21.h, p3/M, z3.h, z0.h\n"
+ "ld1h { z12.h }, p0/Z, [x20, x15, LSL #1]\n"
+ ".inst 0xc16fc9d8 // fclamp { z24.h-z27.h }, z14.h, z15.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1h { z16.h }, p1, [x12, x13, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z23.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "st1h { z17.h }, p1, [x11, x13, LSL #1]\n"
+ "ldr x22, [x14, #0x28]\n"
+ ".inst 0xc16fc9dc // fclamp { z28.h-z31.h }, z14.h, z15.h\n"
"inch x15\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- "whilelt p2.h, x15, %x[n_channels]\n"
- ".inst 0xc16dc9f0 // fclamp { z16.h-z19.h }, z15.h, z13.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- ".inst 0xc16dc9f4 // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
- "inch x10\n"
- "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"
- "ldr x23, [x9, #0x20]\n"
- ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
- "ldr x22, [x9, #0x28]\n"
- ".inst 0xc16dc9f8 // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
- ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
- "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
- "ldr x21, [x9, #0x30]\n"
- "addvl x17, x17, #4\n"
- "cmp x10, %x[n_channels]\n"
- "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
- "ldr x20, [x9, #0x38]\n"
- ".inst 0xc16dc9fc // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
- "ld1h { z8.h }, p3/Z, [x17]\n"
- "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
- "ldr x23, [x9, #0x40]\n"
- "addvl x17, x17, #1\n"
- "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
- "ldr x22, [x9, #0x48]\n"
- "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
- "ldr x21, [x9, #0x50]\n"
- "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
- "ldr x20, [x9, #0x58]\n"
- "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
- "ldr x23, [x9, #0x60]\n"
- "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
- "ldr x22, [x9, #0x68]\n"
- "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
- "ldr x21, [x9, #0x70]\n"
- "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
- "ldr x20, [x9, #0x78]\n"
- "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
- "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
- "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
- "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+ "st1h { z18.h }, p1, [x10, x13, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1h { z19.h }, p1, [x9, x13, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "st1h { z24.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ ".inst 0xc16fc9d4 // fclamp { z20.h-z23.h }, z14.h, z15.h\n"
+ "addvl x8, x8, #1\n"
+ "st1h { z25.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "st1h { z26.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1h { z27.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1h { z28.h }, p1, [x23, x13, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1h { z29.h }, p1, [x22, x13, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1h { z30.h }, p1, [x21, x13, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z31.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1h { z20.h }, p1, [x23, x13, LSL #1]\n"
+ "st1h { z21.h }, p1, [x22, x13, LSL #1]\n"
+ "st1h { z22.h }, p1, [x21, x13, LSL #1]\n"
+ "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
- "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
- "ldr x27, [x16, #0x20]\n"
- "inch x28\n"
- "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "ldr x26, [x16, #0x30]\n"
- "mov p1.b, p2.b\n"
- "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
- "ldr x25, [x16, #0x28]\n"
- "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
- "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
- "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
- "ldr x24, [x16, #0x38]\n"
+ "movprfx z21, z13\n fmla z21.h, p3/M, z4.h, z9.h\n"
+ "movprfx z24, z13\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "inch x13\n"
+ "movprfx z22, z13\n fmla z22.h, p3/M, z3.h, z9.h\n"
+ "movprfx z29, z13\n fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z13\n fmla z30.h, p3/M, z0.h, z9.h\n"
+ "ldr x23, [x17, #0x28]\n"
+ "movprfx z25, z13\n fmla z25.h, p3/M, z7.h, z9.h\n"
+ "movprfx z26, z13\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "movprfx z20, z13\n fmla z20.h, p3/M, z5.h, z9.h\n"
+ "ldr x22, [x17, #0x38]\n"
"fmla z21.h, p3/M, z5.h, z12.h\n"
- "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x14, [x16, #0x40]\n"
- "fmla z16.h, p3/M, z0.h, z10.h\n"
- "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x13, [x16, #0x48]\n"
+ "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x40]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z13\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x48]\n"
"fmla z22.h, p3/M, z4.h, z12.h\n"
- "fmla z25.h, p3/M, z2.h, z12.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x12, [x16, #0x50]\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "fmla z17.h, p3/M, z8.h, z12.h\n"
- "ldr x27, [x16, #0x60]\n"
- "fmla z18.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x25, [x16, #0x68]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z30.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z12.h\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "movprfx z16, z13\n fmla z16.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x25, [x17, #0x68]\n"
"fmla z21.h, p3/M, z7.h, z9.h\n"
- "fmla z19.h, p3/M, z6.h, z12.h\n"
- "ldr x11, [x16, #0x58]\n"
- "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
- "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldr x26, [x16, #0x70]\n"
- "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z23, z13\n fmla z23.h, p3/M, z3.h, z12.h\n"
+ "movprfx z31, z13\n fmla z31.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z19, z13\n fmla z19.h, p3/M, z8.h, z18.h\n"
"fmla z22.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldr x24, [x16, #0x78]\n"
- "fmla z25.h, p3/M, z4.h, z9.h\n"
- "fmla z26.h, p3/M, z3.h, z9.h\n"
- "ldr x14, [x16, #0x80]\n"
- "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
- "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
- "ldr x13, [x16, #0x88]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z29.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z17, z13\n fmla z17.h, p3/M, z1.h, z9.h\n"
+ "movprfx z18, z13\n fmla z18.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x17, #0x88]\n"
"fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z24.h, p3/M, z5.h, z9.h\n"
- "ldr x23, [x9, #0x0]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "fmla z16.h, p3/M, z1.h, z12.h\n"
- "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0x90]\n"
- "fmla z17.h, p3/M, z0.h, z12.h\n"
- "fmla z18.h, p3/M, z2.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "ldr x11, [x16, #0x98]\n"
- "fmla z21.h, p3/M, z8.h, z10.h\n"
- "fmla z19.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xa0]\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z23.h, p3/M, z6.h, z10.h\n"
- "ldr x22, [x9, #0x8]\n"
+ "fmla z28.h, p3/M, z5.h, z9.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "fmla z16.h, p3/M, z2.h, z9.h\n"
+ "fmla z24.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla z21.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z22.h, p3/M, z7.h, z11.h\n"
+ "fmla z23.h, p3/M, z6.h, z11.h\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z29.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z19.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z24.h, p3/M, z3.h, z13.h\n"
+ "fmla z20.h, p3/M, z0.h, z13.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z16.h, p3/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z26.h, p3/M, z3.h, z11.h\n"
+ "fmla z21.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z10.h\n"
+ "fmla z22.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "fmla z19.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z20.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0xc8]\n"
"fmla z25.h, p3/M, z5.h, z10.h\n"
"fmla z26.h, p3/M, z4.h, z10.h\n"
- "ldr x21, [x9, #0x10]\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
"fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z29.h, p3/M, z2.h, z10.h\n"
- "ldr x20, [x9, #0x18]\n"
- "fmla z30.h, p3/M, z1.h, z10.h\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z16.h, p3/M, z3.h, z9.h\n"
- "fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z22.h, p3/M, z1.h, z10.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z16.h, p3/M, z7.h, z9.h\n"
+ "fmla z17.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xd0]\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z25.h, p3/M, z6.h, z11.h\n"
+ "fmla z20.h, p3/M, z4.h, z11.h\n"
+ "fmla z21.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z26.h, p3/M, z8.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z13.h\n"
+ "fmla z19.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z21.h, p3/M, z6.h, z10.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z16.h, p3/M, z1.h, z10.h\n"
+ "fmla z17.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z24.h, p3/M, z6.h, z13.h\n"
+ "fmla z20.h, p3/M, z3.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "fmla z18.h, p3/M, z2.h, z9.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "fmla z22.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z23.h, p3/M, z7.h, z9.h\n"
+ "fmla z30.h, p3/M, z5.h, z9.h\n"
+ "fmla z19.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z12.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z16.h, p3/M, z0.h, z11.h\n"
+ "fmla z20.h, p3/M, z6.h, z11.h\n"
"fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xb0]\n"
+ "ld1h { z9.h }, p2/Z, [x26, x16, LSL #1]\n"
"fmla z17.h, p3/M, z4.h, z10.h\n"
"fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z19.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z2.h, z12.h\n"
- "fmla z22.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0xb8]\n"
- "fmla z27.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ldr x14, [x16, #0xc0]\n"
- "fmla z16.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x13, [x16, #0xc8]\n"
- "fmla z17.h, p3/M, z5.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z21.h, p3/M, z2.h, z12.h\n"
- "fmla z19.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z1.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "ldr x11, [x16, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z11.h\n"
- "fmla z29.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0xd0]\n"
- "fmla z16.h, p3/M, z7.h, z10.h\n"
- "fmla z17.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z21.h, p3/M, z3.h, z10.h\n"
- "fmla z24.h, p3/M, z1.h, z10.h\n"
- "fmla z25.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xe0]\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z19.h, p3/M, z7.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z23.h, p3/M, z4.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xf0]\n"
- "fmla z16.h, p3/M, z2.h, z10.h\n"
- "fmla z17.h, p3/M, z1.h, z10.h\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0xf8]\n"
- "fmla z21.h, p3/M, z6.h, z11.h\n"
- "fmla z24.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z12.h\n"
- "ldr x14, [x16, #0x100]\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "fmla z19.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
- "ldr x13, [x16, #0x108]\n"
- "fmla z16.h, p3/M, z6.h, z10.h\n"
- "fmla z20.h, p3/M, z3.h, z10.h\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z2.h, z11.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z22.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ldr x12, [x16, #0x110]\n"
- "fmla z23.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z19.h, p3/M, z8.h, z12.h\n"
- "ldr x11, [x16, #0x118]\n"
- "fmla z27.h, p3/M, z2.h, z12.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
"fmla z23.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z25.h, p3/M, z7.h, z11.h\n"
- "fmla z26.h, p3/M, z6.h, z11.h\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z24.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z5.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z13.h }, p2/Z, [x27, x16, LSL #1]\n"
"fmla z29.h, p3/M, z7.h, z10.h\n"
"fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z16.h, p3/M, z5.h, z10.h\n"
"fmla z28.h, p3/M, z8.h, z10.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z7.h, z11.h\n"
- "fmla z27.h, p3/M, z6.h, z11.h\n"
- "fmla z29.h, p3/M, z5.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z11.h\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z16.h, p3/M, z4.h, z10.h\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z18.h, p3/M, z5.h, z11.h\n"
- "fmla z19.h, p3/M, z4.h, z11.h\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z30.h, p3/M, z7.h, z12.h\n"
- "fmla z31.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z13.h\n"
+ "fmla z19.h, p3/M, z2.h, z13.h\n"
+ "fmla z17.h, p3/M, z7.h, z9.h\n"
+ "fmla z18.h, p3/M, z6.h, z9.h\n"
+ "fmla z16.h, p3/M, z8.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "fmla z19.h, p3/M, z3.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z3.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z21.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z8.h, z13.h\n"
+ "fmla z18.h, p3/M, z7.h, z13.h\n"
+ "fmla z19.h, p3/M, z6.h, z13.h\n"
"fmla z22.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z13.h }, p2/Z, [x21, x16, LSL #1]\n"
"fmla z23.h, p3/M, z1.h, z11.h\n"
- "fmla z26.h, p3/M, z8.h, z10.h\n"
- "fmla z27.h, p3/M, z7.h, z10.h\n"
- ".inst 0xc16dc9f0 // fclamp { z16.h-z19.h }, z15.h, z13.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- ".inst 0xc16dc9f4 // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"
- "ldr x23, [x9, #0x20]\n"
- "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
- "ldr x22, [x9, #0x28]\n"
- "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
- "ldr x21, [x9, #0x30]\n"
- ".inst 0xc16dc9f8 // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
- "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
- "ldr x20, [x9, #0x38]\n"
- "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
- "ldr x23, [x9, #0x40]\n"
- ".inst 0xc16dc9fc // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
- "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
- "ldr x22, [x9, #0x48]\n"
- "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
- "ldr x21, [x9, #0x50]\n"
- "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
- "ldr x20, [x9, #0x58]\n"
- "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
- "ldr x23, [x9, #0x60]\n"
- "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
- "ldr x22, [x9, #0x68]\n"
- "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
- "ldr x21, [x9, #0x70]\n"
- "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
- "ldr x20, [x9, #0x78]\n"
- "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
- "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
- "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
- "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z9.h\n"
+ ".inst 0xc16fc9d8 // fclamp { z24.h-z27.h }, z14.h, z15.h\n"
+ "fmla z31.h, p3/M, z7.h, z9.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z16.h, p3/M, z4.h, z13.h\n"
+ "fmla z17.h, p3/M, z3.h, z13.h\n"
+ ".inst 0xc16fc9d4 // fclamp { z20.h-z23.h }, z14.h, z15.h\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "st1h { z24.h }, p0, [x12, x13, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z19.h, p3/M, z4.h, z9.h\n"
+ "st1h { z25.h }, p0, [x11, x13, LSL #1]\n"
+ "ldr x22, [x14, #0x28]\n"
+ ".inst 0xc16fc9dc // fclamp { z28.h-z31.h }, z14.h, z15.h\n"
+ "st1h { z26.h }, p0, [x10, x13, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "st1h { z27.h }, p0, [x9, x13, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "st1h { z20.h }, p0, [x23, x13, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ ".inst 0xc16fc9d0 // fclamp { z16.h-z19.h }, z14.h, z15.h\n"
+ "st1h { z21.h }, p0, [x22, x13, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "st1h { z22.h }, p0, [x21, x13, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1h { z23.h }, p0, [x20, x13, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1h { z28.h }, p0, [x23, x13, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1h { z29.h }, p0, [x22, x13, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1h { z30.h }, p0, [x21, x13, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z31.h }, p0, [x20, x13, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1h { z16.h }, p0, [x23, x13, LSL #1]\n"
+ "st1h { z17.h }, p0, [x22, x13, LSL #1]\n"
+ "st1h { z18.h }, p0, [x21, x13, LSL #1]\n"
+ "st1h { z19.h }, p0, [x20, x13, LSL #1]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 27fcb2e6d2..eacad19f36 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
namespace arm_conv {
namespace depthwise {
@@ -65,3 +67,5 @@ class sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 066ce06aa6..6015161a4b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -102,58 +102,58 @@ void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
"mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
- "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"add x7, x4, x4\n"
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x8, x7, x4\n"
"add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x8, x5, x21, LSL #1\n"
- "add x17, x7, x4\n"
- "add x16, x8, x21, LSL #1\n"
- "add x15, x17, x4\n"
- "add x14, x16, x21, LSL #1\n"
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #1\n"
+ "add x15, x16, x21, LSL #1\n"
+ "add x14, x15, x21, LSL #1\n"
"add x13, x14, x21, LSL #1\n"
"cbnz x3, 2f\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"lsl x12, %x[n_channels], #0x1\n"
"mov x28, #0x8\n"
"mul x28, x28, x4\n"
- "add x27, x16, x7, LSL #1\n"
+ "add x27, x15, x7, LSL #1\n"
"add x26, x5, x4, LSL #1\n"
- "add x25, x5, x17, LSL #1\n"
- "sub x20, x24, x3\n"
- "add x24, x5, x15, LSL #1\n"
+ "add x25, x5, x8, LSL #1\n"
+ "sub x20, x20, x3\n"
+ "add x24, x5, x17, LSL #1\n"
"sub x20, x20, #0x1\n"
- "add x23, x8, x4, LSL #1\n"
+ "add x23, x16, x4, LSL #1\n"
"and x20, x20, #0x3fffff\n"
"add x22, x5, x7, LSL #1\n"
"orr x12, x12, x20, LSL #22\n"
- "add x21, x8, x17, LSL #1\n"
+ "add x21, x16, x8, LSL #1\n"
"orr x12, x12, x28, LSL #38\n"
- "add x20, x8, x15, LSL #1\n"
- "add x11, x8, x7, LSL #1\n"
+ "add x20, x16, x17, LSL #1\n"
+ "add x11, x16, x7, LSL #1\n"
"add x10, x14, x4, LSL #1\n"
- "add x9, x16, x4, LSL #1\n"
- "add x28, x14, x17, LSL #1\n"
+ "add x9, x15, x4, LSL #1\n"
+ "add x28, x14, x8, LSL #1\n"
".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
- "add x27, x16, x17, LSL #1\n"
+ "add x27, x15, x8, LSL #1\n"
".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
- "add x26, x14, x15, LSL #1\n"
+ "add x26, x14, x17, LSL #1\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
- "add x25, x16, x15, LSL #1\n"
+ "add x25, x15, x17, LSL #1\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
"add x24, x13, x4, LSL #1\n"
- ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
"add x23, x14, x7, LSL #1\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
- "add x22, x13, x17, LSL #1\n"
+ "add x22, x13, x8, LSL #1\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
"add x21, x13, x7, LSL #1\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
- "add x20, x13, x15, LSL #1\n"
+ "add x20, x13, x17, LSL #1\n"
".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
- ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac49fa // rprfm pldonce, x12, [x15]\n"
".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
@@ -167,199 +167,199 @@ void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mov x20, #0x2\n"
- "ld1h { z19.h }, p3/Z, [x6]\n"
+ "ld1h { z28.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cnth x24\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x25\n"
".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "mul x22, x2, x26\n" // offset = tile_i * ld_output_row
- "cmp x24, %x[n_channels]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "madd x22, x3, x25, x22\n" // offset += tile_j * ld_output_col
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x22, x2, x23\n" // offset = tile_i * ld_output_row
+ "cmp x25, %x[n_channels]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x3, x26, x22\n" // offset += tile_j * ld_output_col
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mov x21, #0x0\n"
"mul x22, x22, x20\n" // offset *= output_tile_size
- "sub x20, XZR, x24\n"
+ "sub x20, XZR, x25\n"
"ld1h { z8.h }, p3/Z, [x6]\n"
- "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "add x24, x24, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z9.h }, p2/Z, [x15, x7, LSL #1]\n"
"addvl x6, x6, #1\n"
- "add x22, x23, x26, LSL #1\n"
+ "add x23, x24, x23, LSL #1\n"
"ld1h { z10.h }, p2/Z, [x5]\n"
"ld1h { z11.h }, p2/Z, [x5, x4, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x5, x15, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x8]\n"
- "ld1h { z15.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x5, x8, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x16]\n"
+ "ld1h { z15.h }, p2/Z, [x16, x4, LSL #1]\n"
"ld1h { z16.h }, p2/Z, [x5, x7, LSL #1]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "whilelt p1.h, x24, %x[n_channels]\n"
+ "movprfx z24, z28\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z25, z28\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "whilelt p1.h, x25, %x[n_channels]\n"
"inch x21\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z19.h }, p3/Z, [x6]\n"
+ "movprfx z26, z28\n fmla z26.h, p3/M, z2.h, z9.h\n"
+ "movprfx z27, z28\n fmla z27.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z28.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- "inch x24\n"
+ "inch x25\n"
"mov p0.b, p2.b\n"
"addvl x5, x5, #1\n"
"inch x20\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z18.h }, p2/Z, [x16, x17, LSL #1]\n"
"ld1h { z10.h }, p1/Z, [x5]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
- "addvl x8, x8, #1\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x14]\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x16]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z9.h }, p2/Z, [x16, x8, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x16, x7, LSL #1]\n"
"addvl x16, x16, #1\n"
- "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x13]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x5, x17, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z25.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x14]\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z25.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z19.h }, p2/Z, [x15]\n"
+ "ld1h { z17.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z25.h, p3/M, z5.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, x8, LSL #1]\n"
+ "ld1h { z0.h }, p2/Z, [x15, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, x17, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z24.h, p3/M, z5.h, z22.h\n"
+ "fmla z25.h, p3/M, z3.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x8, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x15, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z26.h, p3/M, z1.h, z0.h\n"
+ "ld1h { z17.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x13]\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "fmla z27.h, p3/M, z1.h, z18.h\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z0.h\n"
"ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z20.h\n"
"addvl x14, x14, #1\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "ld1h { z13.h }, p1/Z, [x5, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x5, x8, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z18.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z13.h }, p1/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z20.h\n"
+ "fmla z26.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x13, x17, LSL #1]\n"
"whilelt p2.h, x21, %x[n_channels]\n"
- "cmp x24, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
"addvl x13, x13, #1\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
+ "fmla z27.h, p3/M, z3.h, z16.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
"ld1h { z16.h }, p1/Z, [x5, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
- "ld1h { z14.h }, p1/Z, [x8]\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z27.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z14.h }, p1/Z, [x16]\n"
+ "fmla z27.h, p3/M, z6.h, z19.h\n"
".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "ld1h { z15.h }, p1/Z, [x8, x4, LSL #1]\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z15.h }, p1/Z, [x16, x4, LSL #1]\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
"ld1h { z11.h }, p1/Z, [x5, x4, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ ".inst 0xc17fcbd8 // fclamp { z24.h-z27.h }, z30.h, z31.h\n"
+ "st1h { z24.h }, p0, [x24]\n"
+ "st1h { z25.h }, p0, [x24, x26, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x26, LSL #1]\n"
"addvl x23, x23, #1\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "movprfx z24, z28\n fmla z24.h, p3/M, z8.h, z9.h\n"
+ "movprfx z25, z28\n fmla z25.h, p3/M, z6.h, z9.h\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"mov p0.b, p2.b\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z26, z28\n fmla z26.h, p3/M, z2.h, z9.h\n"
+ "movprfx z27, z28\n fmla z27.h, p3/M, z0.h, z9.h\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"add x3, x3, #0x1\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z21.h }, p2/Z, [x16, x17, LSL #1]\n"
"add x20, x2, #0x1\n"
- "cmp x3, x24\n"
+ "cmp x3, x22\n"
"csel x2, x2, x20, LT\n"
"csel x3, x3, XZR, LT\n"
"cmp x2, x21\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x14]\n"
- "fmla z29.h, p3/M, z0.h, z16.h\n"
- "fmla z28.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x16]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x13]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x16, x8, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z14.h\n"
+ "fmla z25.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x14]\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z23.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z25.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z22.h }, p2/Z, [x15]\n"
+ "ld1h { z19.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z22.h\n"
+ "fmla z24.h, p3/M, z2.h, z16.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, x8, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x15, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z20.h\n"
+ "fmla z25.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x8, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x13]\n"
+ "fmla z25.h, p3/M, z7.h, z18.h\n"
+ "fmla z27.h, p3/M, z1.h, z18.h\n"
+ "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z21.h\n"
+ "fmla z27.h, p3/M, z5.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z20.h\n"
+ "fmla z27.h, p3/M, z2.h, z21.h\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z18.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "fmla z27.h, p3/M, z7.h, z17.h\n"
+ "fmla z27.h, p3/M, z6.h, z19.h\n"
+ "fmla z27.h, p3/M, z8.h, z16.h\n"
+ ".inst 0xc17fcbd8 // fclamp { z24.h-z27.h }, z30.h, z31.h\n"
+ "st1h { z24.h }, p0, [x24]\n"
+ "st1h { z25.h }, p0, [x24, x26, LSL #1]\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x26, LSL #1]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
@@ -371,4 +371,4 @@ void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 1bf3a84959..ebbbd760fc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -96,24 +96,24 @@ void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
".inst 0x25207810 // ptrue pn8.b\n"
"cnth x13\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z24.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
"cmp x13, %x[n_channels]\n"
- "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x28, XZR, x13\n"
- "ld1h { z17.h }, p3/Z, [x14]\n"
+ "ld1rh { z27.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x10, XZR, x13\n"
+ "ldp x9, x28, [x20, #0x10]\n"
+ "ld1h { z23.h }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
"ldp x27, x26, [x16, #0x0]\n"
- "ldp x25, x24, [x16, #0x10]\n"
".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "ldp x21, x20, [x16, #0x30]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
"ld1h { z8.h }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
+ "ldp x21, x20, [x16, #0x30]\n"
"ld1h { z9.h }, p2/Z, [x27, x15, LSL #1]\n"
"ld1h { z10.h }, p2/Z, [x26, x15, LSL #1]\n"
"ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
@@ -124,187 +124,187 @@ void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x27, [x16, #0x40]\n"
+ "movprfx z28, z23\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z23\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x25, [x16, #0x40]\n"
"whilelt p1.h, x13, %x[n_channels]\n"
- "ldr x26, [x16, #0x48]\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "ld1h { z17.h }, p3/Z, [x14]\n"
- "ldr x25, [x16, #0x50]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "movprfx z30, z23\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z23\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z23.h }, p3/Z, [x14]\n"
+ "ldr x21, [x16, #0x50]\n"
"addvl x14, x14, #1\n"
- "inch x28\n"
- "ldr x24, [x16, #0x58]\n"
+ "inch x10\n"
+ "ldr x20, [x16, #0x58]\n"
"mov p0.b, p2.b\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
"fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x23, [x16, #0x60]\n"
- "ldr x22, [x16, #0x68]\n"
+ "ld1h { z20.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x26, [x16, #0x68]\n"
+ "ldr x23, [x16, #0x88]\n"
"fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
"fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x27, [x16, #0x80]\n"
- "ldr x26, [x16, #0x88]\n"
+ "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x21, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x90]\n"
"fmla z28.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
"fmla z29.h, p3/M, z0.h, z16.h\n"
- "ldr x24, [x16, #0x98]\n"
- "ldr x25, [x16, #0x90]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z22.h }, p2/Z, [x23, x15, LSL #1]\n"
"fmla z28.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ldr x22, [x16, #0xa8]\n"
+ "fmla z29.h, p3/M, z4.h, z25.h\n"
+ "ld1h { z18.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z30.h, p3/M, z0.h, z18.h\n"
"fmla z28.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "ld1h { z15.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x21, [x16, #0xb0]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xc0]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z19.h\n"
+ "fmla z29.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "fmla z30.h, p3/M, z1.h, z15.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z1.h, z25.h\n"
+ "fmla z30.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z15.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z20.h\n"
+ "fmla z31.h, p3/M, z5.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z17.h\n"
+ "fmla z31.h, p3/M, z2.h, z20.h\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x15, LSL #1]\n"
"ldp x27, x26, [x16, #0x0]\n"
- "inch x15\n"
"ldp x25, x24, [x16, #0x10]\n"
- "whilelt p2.h, x15, %x[n_channels]\n"
+ "inch x15\n"
"ldp x23, x22, [x16, #0x20]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
"ldp x21, x20, [x16, #0x30]\n"
"ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n"
"fmla z31.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z8.h, z19.h\n"
"ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n"
"ld1h { z12.h }, p1/Z, [x24, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
"ld1h { z13.h }, p1/Z, [x23, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
- "ld1h { z14.h }, p1/Z, [x22, x13, LSL #1]\n"
"ld1h { z16.h }, p1/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z14.h }, p1/Z, [x22, x13, LSL #1]\n"
".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
+ "fmla z31.h, p3/M, z6.h, z19.h\n"
"ld1h { z15.h }, p1/Z, [x21, x13, LSL #1]\n"
".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z17.h\n"
"ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n"
"inch x13\n"
"cmp x13, %x[n_channels]\n"
"ld1h { z8.h }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
- "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ ".inst 0xc17bcb1c // fclamp { z28.h-z31.h }, z24.h, z27.h\n"
+ "st1h { z28.h }, p0, [x12, x10, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x10, LSL #1]\n"
+ "st1h { z30.h }, p0, [x9, x10, LSL #1]\n"
+ "st1h { z31.h }, p0, [x28, x10, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
- "ldr x27, [x16, #0x40]\n"
- "inch x28\n"
- "ldr x26, [x16, #0x48]\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
- "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z28, z23\n fmla z28.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z23\n fmla z29.h, p3/M, z6.h, z9.h\n"
+ "ldr x25, [x16, #0x40]\n"
+ "inch x10\n"
+ "ldr x22, [x16, #0x48]\n"
+ "movprfx z30, z23\n fmla z30.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z23\n fmla z31.h, p3/M, z0.h, z9.h\n"
"mov p0.b, p2.b\n"
- "ldr x25, [x16, #0x50]\n"
- "ldr x24, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla z28.h, p3/M, z0.h, z10.h\n"
"fmla z29.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x23, [x16, #0x60]\n"
- "ldr x22, [x16, #0x68]\n"
+ "ld1h { z21.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x26, [x16, #0x68]\n"
+ "ldr x23, [x16, #0x88]\n"
"fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
"fmla z29.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x27, [x16, #0x80]\n"
- "ldr x26, [x16, #0x88]\n"
+ "ld1h { z18.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x21, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x90]\n"
"fmla z28.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
"fmla z29.h, p3/M, z0.h, z16.h\n"
- "ldr x24, [x16, #0x98]\n"
- "ldr x25, [x16, #0x90]\n"
- "fmla z30.h, p3/M, z3.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z23.h }, p2/Z, [x23, x15, LSL #1]\n"
"fmla z28.h, p3/M, z4.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z29.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ldr x22, [x16, #0xa8]\n"
+ "fmla z29.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "ld1h { z19.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z30.h, p3/M, z0.h, z22.h\n"
"fmla z28.h, p3/M, z2.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "ld1h { z17.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x21, [x16, #0xb0]\n"
- "fmla z30.h, p3/M, z0.h, z15.h\n"
- "fmla z29.h, p3/M, z5.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xc0]\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z20.h\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x15, LSL #1]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z30.h, p3/M, z4.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z1.h, z16.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z16.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "fmla z30.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z18.h\n"
+ "fmla z31.h, p3/M, z1.h, z18.h\n"
+ "fmla z30.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z21.h\n"
+ "fmla z31.h, p3/M, z5.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z20.h\n"
+ "fmla z31.h, p3/M, z2.h, z21.h\n"
+ "fmla z30.h, p3/M, z5.h, z18.h\n"
"ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z6.h, z15.h\n"
- "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z11.h\n"
- "fmla z31.h, p3/M, z5.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z5.h, z16.h\n"
- "fmla z31.h, p3/M, z3.h, z16.h\n"
- "fmla z30.h, p3/M, z8.h, z15.h\n"
- "fmla z31.h, p3/M, z7.h, z14.h\n"
- "fmla z31.h, p3/M, z6.h, z15.h\n"
- "fmla z31.h, p3/M, z8.h, z11.h\n"
- ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
- "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z18.h\n"
+ "fmla z30.h, p3/M, z8.h, z19.h\n"
+ "fmla z31.h, p3/M, z7.h, z17.h\n"
+ "fmla z31.h, p3/M, z6.h, z19.h\n"
+ "fmla z31.h, p3/M, z8.h, z16.h\n"
+ ".inst 0xc17bcb1c // fclamp { z28.h-z31.h }, z24.h, z27.h\n"
+ "st1h { z28.h }, p0, [x12, x10, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x10, LSL #1]\n"
+ "st1h { z30.h }, p0, [x9, x10, LSL #1]\n"
+ "st1h { z31.h }, p0, [x28, x10, LSL #1]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
@@ -315,4 +315,4 @@ void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 84263cb564..e6864ba2c3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "utils.hpp"
#include <cstdint>
#pragma once
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
namespace arm_conv {
namespace depthwise {
@@ -65,3 +67,5 @@ class sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirs
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 58b7824b98..96231dc1ab 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -102,81 +102,81 @@ void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
"mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
- "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
"add x7, x4, x4\n"
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x8, x7, x4\n"
"add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x8, x5, x21, LSL #1\n"
- "add x17, x7, x4\n"
- "add x16, x8, x21, LSL #1\n"
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #1\n"
"add x15, x17, x4\n"
"add x14, x16, x21, LSL #1\n"
- "add x13, x15, x4\n"
- "add x12, x14, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ "add x12, x13, x21, LSL #1\n"
"add x11, x12, x21, LSL #1\n"
"cbnz x3, 2f\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"lsl x10, %x[n_channels], #0x1\n"
"mov x21, #0x4\n"
"mul x21, x21, x4\n"
"add x9, x5, x4, LSL #1\n"
- "add x28, x8, x4, LSL #1\n"
+ "add x28, x16, x4, LSL #1\n"
"add x27, x5, x7, LSL #1\n"
- "sub x20, x25, x3\n"
- "add x26, x8, x7, LSL #1\n"
+ "sub x20, x20, x3\n"
+ "add x26, x16, x7, LSL #1\n"
"sub x20, x20, #0x1\n"
- "add x25, x5, x17, LSL #1\n"
+ "add x25, x5, x8, LSL #1\n"
"and x20, x20, #0x3fffff\n"
- "add x24, x5, x15, LSL #1\n"
+ "add x24, x5, x17, LSL #1\n"
"orr x10, x10, x20, LSL #22\n"
- "add x23, x8, x13, LSL #1\n"
+ "add x23, x16, x15, LSL #1\n"
"orr x10, x10, x21, LSL #38\n"
- "add x22, x8, x17, LSL #1\n"
- "add x21, x8, x15, LSL #1\n"
- "add x20, x5, x13, LSL #1\n"
+ "add x22, x16, x8, LSL #1\n"
+ "add x21, x16, x17, LSL #1\n"
+ "add x20, x5, x15, LSL #1\n"
".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- "add x9, x16, x4, LSL #1\n"
- ".inst 0xf8aa491a // rprfm pldonce, x10, [x8]\n"
+ "add x9, x14, x4, LSL #1\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
- "add x28, x16, x7, LSL #1\n"
+ "add x28, x14, x7, LSL #1\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
- "add x27, x16, x17, LSL #1\n"
+ "add x27, x14, x8, LSL #1\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
- "add x26, x16, x15, LSL #1\n"
+ "add x26, x14, x17, LSL #1\n"
".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
- "add x25, x16, x13, LSL #1\n"
+ "add x25, x14, x15, LSL #1\n"
".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
- "add x24, x14, x4, LSL #1\n"
+ "add x24, x13, x4, LSL #1\n"
".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
- "add x23, x14, x7, LSL #1\n"
- ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ "add x23, x13, x7, LSL #1\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
- "add x22, x14, x17, LSL #1\n"
+ "add x22, x13, x8, LSL #1\n"
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
- "add x21, x14, x15, LSL #1\n"
+ "add x21, x13, x17, LSL #1\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
- "add x20, x14, x13, LSL #1\n"
+ "add x20, x13, x15, LSL #1\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
"add x9, x12, x4, LSL #1\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
"add x28, x12, x7, LSL #1\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
- "add x27, x12, x17, LSL #1\n"
+ "add x27, x12, x8, LSL #1\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
- "add x26, x12, x15, LSL #1\n"
+ "add x26, x12, x17, LSL #1\n"
".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
- "add x25, x12, x13, LSL #1\n"
- ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ "add x25, x12, x15, LSL #1\n"
+ ".inst 0xf8aa49ba // rprfm pldonce, x10, [x13]\n"
".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
"add x24, x11, x4, LSL #1\n"
".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
"add x23, x11, x7, LSL #1\n"
".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
- "add x22, x11, x17, LSL #1\n"
+ "add x22, x11, x8, LSL #1\n"
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
- "add x21, x11, x15, LSL #1\n"
+ "add x21, x11, x17, LSL #1\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
- "add x20, x11, x13, LSL #1\n"
+ "add x20, x11, x15, LSL #1\n"
".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
@@ -191,387 +191,387 @@ void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
"ldr x27, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mov x26, #0x2\n"
- "cnth x25\n"
- "ld1h { z18.h }, p3/Z, [x6]\n"
- "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mov x23, #0x2\n"
+ "cnth x26\n"
+ "ld1h { z31.h }, p3/Z, [x6]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"addvl x6, x6, #1\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "cmp x25, %x[n_channels]\n"
+ "cmp x26, %x[n_channels]\n"
"mul x22, x2, x27\n" // offset = tile_i * ld_output_row
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z27.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mov x21, #0x0\n"
- "madd x22, x3, x24, x22\n" // offset += tile_j * ld_output_col
- "sub x20, XZR, x25\n"
+ "madd x22, x3, x25, x22\n" // offset += tile_j * ld_output_col
+ "sub x20, XZR, x26\n"
"ld1h { z4.h }, p3/Z, [x6]\n"
- "mul x22, x22, x26\n" // offset *= output_tile_size
+ "mul x22, x22, x23\n" // offset *= output_tile_size
"ld1h { z5.h }, p2/Z, [x5]\n"
"addvl x6, x6, #1\n"
- "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x24, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
"ld1h { z6.h }, p2/Z, [x5, x4, LSL #1]\n"
- "add x22, x23, x27, LSL #1\n"
- "ld1h { z7.h }, p2/Z, [x8]\n"
- "ld1h { z8.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "add x23, x24, x27, LSL #1\n"
+ "ld1h { z7.h }, p2/Z, [x16]\n"
+ "ld1h { z8.h }, p2/Z, [x16, x4, LSL #1]\n"
"ld1h { z9.h }, p2/Z, [x5, x7, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x5, x17, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x5, x15, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x8, x13, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x16]\n"
+ "ld1h { z13.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x5, x8, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x14]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
- "whilelt p1.h, x25, %x[n_channels]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x6]\n"
+ "movprfx z20, z31\n fmla z20.h, p3/M, z0.h, z5.h\n"
+ "movprfx z21, z31\n fmla z21.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z30.h }, p2/Z, [x16, x8, LSL #1]\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
+ "movprfx z22, z31\n fmla z22.h, p3/M, z0.h, z7.h\n"
+ "movprfx z23, z31\n fmla z23.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
"inch x21\n"
- "inch x25\n"
+ "inch x26\n"
"mov p0.b, p2.b\n"
"inch x20\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
- "addvl x8, x8, #1\n"
- "fmla z29.h, p3/M, z1.h, z9.h\n"
- "fmla z30.h, p3/M, z1.h, z8.h\n"
- "fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
- "addvl x5, x5, #1\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z2.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z5.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z6.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z7.h\n"
- "ld1h { z7.h }, p1/Z, [x8]\n"
- "fmla z29.h, p3/M, z0.h, z8.h\n"
- "fmla z30.h, p3/M, z0.h, z14.h\n"
- "fmla z31.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z1.h, z6.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z5.h }, p2/Z, [x16, x17, LSL #1]\n"
"addvl x16, x16, #1\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
- "addvl x6, x6, #16\n"
- "ld1h { z18.h }, p3/Z, [x6, #4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x14]\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z5.h\n"
- "fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z1.h, z6.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z1.h, z8.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z24.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z20.h, p3/M, z2.h, z9.h\n"
+ "fmla z21.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z22.h, p3/M, z2.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z30.h\n"
+ "ld1h { z16.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z20.h, p3/M, z3.h, z11.h\n"
+ "fmla z21.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z30.h\n"
+ "fmla z23.h, p3/M, z3.h, z5.h\n"
+ "ld1h { z29.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z20.h, p3/M, z4.h, z12.h\n"
+ "fmla z21.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z28.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z5.h\n"
+ "fmla z23.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z25.h }, p2/Z, [x14, x8, LSL #1]\n"
+ "ld1h { z3.h }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z20.h, p3/M, z18.h, z7.h\n"
+ "fmla z21.h, p3/M, z18.h, z8.h\n"
+ "ld1h { z7.h }, p1/Z, [x16]\n"
+ "fmla z22.h, p3/M, z18.h, z14.h\n"
+ "fmla z23.h, p3/M, z18.h, z11.h\n"
+ "ld1h { z19.h }, p3/Z, [x6, #5, MUL VL]\n"
+ "fmla z20.h, p3/M, z24.h, z8.h\n"
+ "fmla z21.h, p3/M, z24.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z24.h, z11.h\n"
+ "fmla z23.h, p3/M, z24.h, z28.h\n"
+ "ld1h { z18.h }, p3/Z, [x6, #6, MUL VL]\n"
+ "fmla z20.h, p3/M, z16.h, z13.h\n"
+ "fmla z21.h, p3/M, z16.h, z30.h\n"
+ "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
"addvl x14, x14, #1\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x12]\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z6.h\n"
- "fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z31.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x6]\n"
- "fmla z28.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+ "fmla z22.h, p3/M, z16.h, z28.h\n"
+ "fmla z23.h, p3/M, z16.h, z25.h\n"
+ "ld1h { z9.h }, p3/Z, [x6, #7, MUL VL]\n"
+ "addvl x6, x6, #16\n"
+ "ld1h { z31.h }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z20.h, p3/M, z29.h, z30.h\n"
+ "fmla z21.h, p3/M, z29.h, z5.h\n"
+ "ld1h { z12.h }, p2/Z, [x13]\n"
+ "fmla z22.h, p3/M, z29.h, z25.h\n"
+ "fmla z23.h, p3/M, z29.h, z17.h\n"
+ "ld1h { z4.h }, p3/Z, [x6, #-8, MUL VL]\n"
+ "fmla z20.h, p3/M, z3.h, z5.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z2.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z17.h\n"
+ "fmla z23.h, p3/M, z3.h, z26.h\n"
+ "ld1h { z1.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "ld1h { z16.h }, p3/Z, [x6, #-7, MUL VL]\n"
+ "fmla z20.h, p3/M, z19.h, z14.h\n"
+ "fmla z21.h, p3/M, z19.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z22.h, p3/M, z19.h, z12.h\n"
+ "fmla z23.h, p3/M, z19.h, z2.h\n"
+ "ld1h { z30.h }, p3/Z, [x6, #-6, MUL VL]\n"
+ "fmla z20.h, p3/M, z18.h, z11.h\n"
+ "fmla z21.h, p3/M, z18.h, z28.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z22.h, p3/M, z18.h, z2.h\n"
+ "fmla z23.h, p3/M, z18.h, z1.h\n"
+ "ld1h { z24.h }, p3/Z, [x6, #-5, MUL VL]\n"
+ "fmla z20.h, p3/M, z9.h, z28.h\n"
+ "fmla z21.h, p3/M, z9.h, z25.h\n"
+ "ld1h { z28.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z22.h, p3/M, z9.h, z1.h\n"
+ "fmla z23.h, p3/M, z9.h, z29.h\n"
+ "ld1h { z18.h }, p3/Z, [x6, #-4, MUL VL]\n"
+ "fmla z20.h, p3/M, z4.h, z25.h\n"
+ "fmla z21.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z25.h }, p2/Z, [x12]\n"
+ "fmla z22.h, p3/M, z4.h, z29.h\n"
+ "fmla z23.h, p3/M, z4.h, z28.h\n"
+ "ld1h { z19.h }, p3/Z, [x6, #-3, MUL VL]\n"
+ "fmla z20.h, p3/M, z16.h, z17.h\n"
+ "fmla z21.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z17.h }, p2/Z, [x12, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z16.h, z28.h\n"
+ "fmla z23.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z26.h }, p2/Z, [x12, x17, LSL #1]\n"
+ "ld1h { z16.h }, p3/Z, [x6, #-2, MUL VL]\n"
+ "fmla z20.h, p3/M, z30.h, z12.h\n"
+ "fmla z21.h, p3/M, z30.h, z2.h\n"
+ "ld1h { z6.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z30.h, z25.h\n"
+ "fmla z23.h, p3/M, z30.h, z17.h\n"
+ "ld1h { z11.h }, p3/Z, [x6, #-1, MUL VL]\n"
+ "fmla z20.h, p3/M, z24.h, z2.h\n"
+ "fmla z21.h, p3/M, z24.h, z1.h\n"
+ "ld1h { z8.h }, p2/Z, [x12, x8, LSL #1]\n"
+ "fmla z22.h, p3/M, z24.h, z17.h\n"
+ "fmla z23.h, p3/M, z24.h, z6.h\n"
+ "ld1h { z12.h }, p3/Z, [x6]\n"
+ "fmla z20.h, p3/M, z18.h, z1.h\n"
+ "fmla z21.h, p3/M, z18.h, z29.h\n"
+ "ld1h { z30.h }, p2/Z, [x12, x15, LSL #1]\n"
"addvl x12, x12, #1\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z31.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z31.h, p3/M, z3.h, z8.h\n"
+ "fmla z22.h, p3/M, z18.h, z6.h\n"
+ "fmla z23.h, p3/M, z18.h, z8.h\n"
+ "ld1h { z24.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z20.h, p3/M, z19.h, z29.h\n"
+ "fmla z21.h, p3/M, z19.h, z28.h\n"
+ "ld1h { z18.h }, p2/Z, [x11]\n"
+ "fmla z22.h, p3/M, z19.h, z8.h\n"
+ "fmla z23.h, p3/M, z19.h, z26.h\n"
"ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z14.h\n"
- "ld1h { z14.h }, p1/Z, [x16]\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z20.h, p3/M, z16.h, z28.h\n"
+ "fmla z21.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z28.h }, p2/Z, [x11, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z16.h, z26.h\n"
+ "fmla z23.h, p3/M, z16.h, z30.h\n"
+ "ld1h { z19.h }, p3/Z, [x6, #3, MUL VL]\n"
"addvl x6, x6, #5\n"
- "fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z13.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
- "fmla z31.h, p3/M, z0.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z5.h\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
+ "ld1h { z14.h }, p1/Z, [x14]\n"
+ "fmla z20.h, p3/M, z11.h, z25.h\n"
+ "fmla z21.h, p3/M, z11.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z11.h, z18.h\n"
+ "fmla z23.h, p3/M, z11.h, z28.h\n"
+ "ld1h { z18.h }, p2/Z, [x11, x8, LSL #1]\n"
+ "fmla z20.h, p3/M, z12.h, z17.h\n"
+ "fmla z21.h, p3/M, z12.h, z6.h\n"
+ "ld1h { z13.h }, p1/Z, [x16, x7, LSL #1]\n"
+ "fmla z22.h, p3/M, z12.h, z28.h\n"
+ "fmla z23.h, p3/M, z12.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z20.h, p3/M, z24.h, z6.h\n"
+ "fmla z21.h, p3/M, z24.h, z8.h\n"
"ld1h { z5.h }, p1/Z, [x5]\n"
- "fmla z29.h, p3/M, z2.h, z6.h\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+ "fmla z22.h, p3/M, z24.h, z16.h\n"
+ "fmla z23.h, p3/M, z24.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, x15, LSL #1]\n"
"whilelt p2.h, x21, %x[n_channels]\n"
- "cmp x25, %x[n_channels]\n"
+ "cmp x26, %x[n_channels]\n"
"addvl x11, x11, #1\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
+ "fmla z20.h, p3/M, z3.h, z8.h\n"
+ "fmla z21.h, p3/M, z3.h, z26.h\n"
"ld1h { z6.h }, p1/Z, [x5, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z8.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p1/Z, [x5, x17, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z18.h\n"
+ "fmla z23.h, p3/M, z3.h, z17.h\n"
".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
"addvl x6, x6, #4\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p1/Z, [x8, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p1/Z, [x8, x13, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p1/Z, [x5, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "ld1h { z11.h }, p1/Z, [x5, x8, LSL #1]\n"
+ "fmla z20.h, p3/M, z19.h, z26.h\n"
+ "fmla z21.h, p3/M, z19.h, z30.h\n"
+ "ld1h { z8.h }, p1/Z, [x16, x4, LSL #1]\n"
+ "fmla z22.h, p3/M, z19.h, z17.h\n"
+ "fmla z23.h, p3/M, z19.h, z16.h\n"
"ld1h { z9.h }, p1/Z, [x5, x7, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x5, x17, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x16, x15, LSL #1]\n"
"ld1h { z4.h }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+ ".inst 0xc17bc9f4 // fclamp { z20.h-z23.h }, z15.h, z27.h\n"
+ "st1h { z20.h }, p0, [x24]\n"
+ "st1h { z21.h }, p0, [x24, x25, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
"addvl x23, x23, #1\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
- "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
- "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "movprfx z28, z31\n fmla z28.h, p3/M, z0.h, z5.h\n"
+ "movprfx z29, z31\n fmla z29.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z23.h }, p2/Z, [x16, x8, LSL #1]\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
- "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
- "ld1h { z0.h }, p3/Z, [x6]\n"
+ "movprfx z30, z31\n fmla z30.h, p3/M, z0.h, z7.h\n"
+ "fmla z31.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z19.h }, p3/Z, [x6]\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"mov p0.b, p2.b\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"add x3, x3, #0x1\n"
"fmla z28.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
"fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z22.h }, p2/Z, [x16, x17, LSL #1]\n"
"add x20, x2, #0x1\n"
"fmla z30.h, p3/M, z1.h, z8.h\n"
"fmla z31.h, p3/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
- "cmp x3, x25\n"
+ "ld1h { z21.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "cmp x3, x22\n"
"csel x2, x2, x20, LT\n"
"csel x3, x3, XZR, LT\n"
"cmp x2, x21\n"
"fmla z28.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
"fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x5, x15, LSL #1]\n"
"fmla z30.h, p3/M, z2.h, z13.h\n"
- "fmla z31.h, p3/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z31.h, p3/M, z2.h, z23.h\n"
+ "ld1h { z16.h }, p3/Z, [x6, #2, MUL VL]\n"
"fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
"fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z5.h\n"
- "fmla z31.h, p3/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "ld1h { z0.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "fmla z31.h, p3/M, z3.h, z22.h\n"
+ "ld1h { z17.h }, p3/Z, [x6, #3, MUL VL]\n"
"fmla z28.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z5.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z22.h\n"
"fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z7.h\n"
- "fmla z29.h, p3/M, z0.h, z8.h\n"
- "fmla z30.h, p3/M, z0.h, z14.h\n"
- "fmla z31.h, p3/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z13.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z5.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x14, x8, LSL #1]\n"
+ "ld1h { z20.h }, p3/Z, [x6, #4, MUL VL]\n"
+ "fmla z28.h, p3/M, z19.h, z7.h\n"
+ "fmla z29.h, p3/M, z19.h, z8.h\n"
+ "fmla z30.h, p3/M, z19.h, z14.h\n"
+ "fmla z31.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z19.h }, p3/Z, [x6, #5, MUL VL]\n"
+ "fmla z28.h, p3/M, z21.h, z8.h\n"
+ "fmla z29.h, p3/M, z21.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z0.h\n"
+ "fmla z31.h, p3/M, z21.h, z5.h\n"
+ "ld1h { z18.h }, p3/Z, [x6, #6, MUL VL]\n"
+ "fmla z28.h, p3/M, z16.h, z13.h\n"
+ "fmla z29.h, p3/M, z16.h, z23.h\n"
+ "ld1h { z25.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z16.h, z5.h\n"
+ "fmla z31.h, p3/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p3/Z, [x6, #7, MUL VL]\n"
"addvl x6, x6, #16\n"
- "fmla z28.h, p3/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x14]\n"
- "fmla z29.h, p3/M, z3.h, z6.h\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z31.h, p3/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z13.h\n"
- "fmla z31.h, p3/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z11.h\n"
- "fmla z30.h, p3/M, z0.h, z5.h\n"
- "fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z12.h\n"
- "fmla z30.h, p3/M, z1.h, z6.h\n"
- "fmla z31.h, p3/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z9.h\n"
- "fmla z30.h, p3/M, z2.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x12]\n"
- "fmla z29.h, p3/M, z3.h, z13.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z14.h\n"
+ "fmla z28.h, p3/M, z17.h, z23.h\n"
+ "fmla z29.h, p3/M, z17.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x13]\n"
+ "fmla z30.h, p3/M, z17.h, z3.h\n"
+ "fmla z31.h, p3/M, z17.h, z25.h\n"
+ "ld1h { z17.h }, p3/Z, [x6, #-8, MUL VL]\n"
+ "fmla z28.h, p3/M, z20.h, z22.h\n"
+ "fmla z29.h, p3/M, z20.h, z10.h\n"
+ "ld1h { z23.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z25.h\n"
+ "fmla z31.h, p3/M, z20.h, z26.h\n"
+ "ld1h { z2.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x6, #-7, MUL VL]\n"
+ "fmla z28.h, p3/M, z19.h, z14.h\n"
+ "fmla z29.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z19.h, z24.h\n"
+ "fmla z31.h, p3/M, z19.h, z23.h\n"
+ "ld1h { z21.h }, p3/Z, [x6, #-6, MUL VL]\n"
+ "fmla z28.h, p3/M, z18.h, z0.h\n"
+ "fmla z29.h, p3/M, z18.h, z5.h\n"
+ "ld1h { z0.h }, p2/Z, [x13, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z18.h, z23.h\n"
+ "fmla z31.h, p3/M, z18.h, z2.h\n"
+ "ld1h { z20.h }, p3/Z, [x6, #-5, MUL VL]\n"
+ "fmla z28.h, p3/M, z16.h, z5.h\n"
+ "fmla z29.h, p3/M, z16.h, z3.h\n"
+ "ld1h { z19.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z16.h, z2.h\n"
+ "fmla z31.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x6, #-4, MUL VL]\n"
+ "fmla z28.h, p3/M, z17.h, z3.h\n"
+ "fmla z29.h, p3/M, z17.h, z25.h\n"
+ "ld1h { z16.h }, p2/Z, [x12]\n"
+ "fmla z30.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x6, #-3, MUL VL]\n"
+ "fmla z28.h, p3/M, z22.h, z25.h\n"
+ "fmla z29.h, p3/M, z22.h, z26.h\n"
+ "ld1h { z7.h }, p2/Z, [x12, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z22.h, z19.h\n"
+ "fmla z31.h, p3/M, z22.h, z1.h\n"
+ "ld1h { z12.h }, p2/Z, [x12, x17, LSL #1]\n"
"ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z6.h\n"
- "fmla z30.h, p3/M, z0.h, z9.h\n"
- "fmla z31.h, p3/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
- "fmla z28.h, p3/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "fmla z31.h, p3/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p3/Z, [x6]\n"
- "fmla z28.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z2.h, z5.h\n"
- "fmla z31.h, p3/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
- "fmla z28.h, p3/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11]\n"
- "fmla z29.h, p3/M, z3.h, z12.h\n"
- "fmla z30.h, p3/M, z3.h, z6.h\n"
- "fmla z31.h, p3/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z14.h\n"
- "fmla z30.h, p3/M, z4.h, z8.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
- "fmla z28.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z0.h, z13.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
- "fmla z31.h, p3/M, z0.h, z12.h\n"
- "fmla z28.h, p3/M, z1.h, z13.h\n"
- "fmla z29.h, p3/M, z1.h, z5.h\n"
- "fmla z30.h, p3/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z31.h, p3/M, z1.h, z9.h\n"
- "fmla z28.h, p3/M, z2.h, z5.h\n"
- "fmla z29.h, p3/M, z2.h, z6.h\n"
- "fmla z30.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "fmla z28.h, p3/M, z3.h, z6.h\n"
- "fmla z29.h, p3/M, z3.h, z8.h\n"
- "fmla z30.h, p3/M, z3.h, z11.h\n"
- "fmla z31.h, p3/M, z3.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z8.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z21.h, z24.h\n"
+ "fmla z29.h, p3/M, z21.h, z23.h\n"
+ "ld1h { z26.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z16.h\n"
+ "fmla z31.h, p3/M, z21.h, z7.h\n"
+ "ld1h { z25.h }, p3/Z, [x6, #-1, MUL VL]\n"
+ "fmla z28.h, p3/M, z20.h, z23.h\n"
+ "fmla z29.h, p3/M, z20.h, z2.h\n"
+ "ld1h { z24.h }, p2/Z, [x12, x8, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x6]\n"
+ "fmla z28.h, p3/M, z18.h, z2.h\n"
+ "fmla z29.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z18.h, z26.h\n"
+ "fmla z31.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x6, #1, MUL VL]\n"
+ "fmla z28.h, p3/M, z17.h, z0.h\n"
+ "fmla z29.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p2/Z, [x11]\n"
+ "fmla z30.h, p3/M, z17.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z12.h\n"
+ "ld1h { z20.h }, p3/Z, [x6, #2, MUL VL]\n"
+ "fmla z28.h, p3/M, z4.h, z19.h\n"
+ "fmla z29.h, p3/M, z4.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x4, LSL #1]\n"
"fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z9.h\n"
- ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
- "st1h { z28.h }, p0, [x23]\n"
- "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
- "st1h { z30.h }, p0, [x22]\n"
- "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x6, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z25.h, z16.h\n"
+ "fmla z29.h, p3/M, z25.h, z7.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z25.h, z18.h\n"
+ "fmla z31.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x11, x8, LSL #1]\n"
+ "fmla z28.h, p3/M, z23.h, z7.h\n"
+ "fmla z29.h, p3/M, z23.h, z26.h\n"
+ "fmla z30.h, p3/M, z23.h, z17.h\n"
+ "fmla z31.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "fmla z28.h, p3/M, z21.h, z26.h\n"
+ "fmla z29.h, p3/M, z21.h, z24.h\n"
+ "fmla z30.h, p3/M, z21.h, z16.h\n"
+ "fmla z31.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z20.h, z24.h\n"
+ "fmla z29.h, p3/M, z20.h, z12.h\n"
+ "fmla z30.h, p3/M, z20.h, z18.h\n"
+ "fmla z31.h, p3/M, z20.h, z17.h\n"
+ "fmla z28.h, p3/M, z19.h, z12.h\n"
+ "fmla z29.h, p3/M, z19.h, z22.h\n"
+ "fmla z30.h, p3/M, z19.h, z17.h\n"
+ "fmla z31.h, p3/M, z19.h, z16.h\n"
+ ".inst 0xc17bc9fc // fclamp { z28.h-z31.h }, z15.h, z27.h\n"
+ "st1h { z28.h }, p0, [x24]\n"
+ "st1h { z29.h }, p0, [x24, x25, LSL #1]\n"
+ "st1h { z30.h }, p0, [x23]\n"
+ "st1h { z31.h }, p0, [x23, x25, LSL #1]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
@@ -583,4 +583,4 @@ void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 313036876e..e76f92e3cf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -99,439 +99,439 @@ void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "mov x15, #0x0\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mov x16, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
"whilelt p3.h, XZR, %x[n_channels]\n"
"ptrue p2.b\n"
- "cnth x13\n"
- "ldp x12, x11, [x20, #0x0]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "sub x11, XZR, x14\n"
"ldp x10, x9, [x20, #0x10]\n"
- "cmp x13, %x[n_channels]\n"
- "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "sub x28, XZR, x13\n"
- "ldp x27, x26, [x16, #0x0]\n"
- "ld1h { z17.h }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
- "ldp x25, x24, [x16, #0x10]\n"
- ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
- "ldp x23, x22, [x16, #0x20]\n"
- "ld1rh { z16.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z5.h }, p3/Z, [x27, x15, LSL #1]\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "ld1h { z6.h }, p3/Z, [x26, x15, LSL #1]\n"
- "ldp x27, x26, [x16, #0x40]\n"
- "ld1h { z4.h }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
- "ld1h { z7.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
- "ld1h { z13.h }, p3/Z, [x22, x15, LSL #1]\n"
- "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
- "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x27, x15, LSL #1]\n"
- "ld1h { z14.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x21, x20, [x17, #0x0]\n"
+ ".inst 0xa040a1e0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ld1rh { z27.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "ld1h { z4.h }, p2/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ld1h { z5.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x16, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
- "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
- "ldr x25, [x16, #0x50]\n"
- "whilelt p1.h, x13, %x[n_channels]\n"
- "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
- "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
- "ldr x24, [x16, #0x58]\n"
- "ld1h { z0.h }, p2/Z, [x14]\n"
- "ldr x23, [x16, #0x60]\n"
- "inch x28\n"
+ "movprfx z28, z26\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z26\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x21, [x17, #0x50]\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "movprfx z30, z26\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z26\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x15]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "inch x11\n"
"mov p0.b, p3.b\n"
- "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ldr x22, [x16, #0x68]\n"
+ "ld1h { z17.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x25, [x17, #0x68]\n"
"fmla z28.h, p2/M, z1.h, z6.h\n"
"fmla z29.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldr x21, [x16, #0x70]\n"
+ "ld1h { z24.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x24, [x17, #0x70]\n"
"fmla z30.h, p2/M, z1.h, z8.h\n"
"fmla z31.h, p2/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x27, [x16, #0x80]\n"
+ "ld1h { z26.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "ldr x27, [x17, #0x80]\n"
+ "ldr x20, [x17, #0x88]\n"
"fmla z28.h, p2/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
"fmla z29.h, p2/M, z2.h, z11.h\n"
- "ldr x26, [x16, #0x88]\n"
+ "ld1h { z16.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x90]\n"
"fmla z30.h, p2/M, z2.h, z13.h\n"
- "fmla z31.h, p2/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
- "ldr x25, [x16, #0x90]\n"
- "ldr x24, [x16, #0x98]\n"
+ "fmla z31.h, p2/M, z2.h, z17.h\n"
+ "ld1h { z23.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "ldr x26, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
"fmla z28.h, p2/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
"fmla z29.h, p2/M, z3.h, z12.h\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.h, p2/M, z3.h, z5.h\n"
- "fmla z31.h, p2/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
+ "ld1h { z22.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z30.h, p2/M, z3.h, z17.h\n"
+ "fmla z31.h, p2/M, z3.h, z24.h\n"
+ "ld1h { z5.h }, p2/Z, [x15, #3, MUL VL]\n"
"fmla z28.h, p2/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z29.h, p2/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z16.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z30.h, p2/M, z4.h, z24.h\n"
"fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1h { z18.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0xb8]\n"
+ "ld1h { z16.h }, p2/Z, [x15, #4, MUL VL]\n"
"fmla z28.h, p2/M, z0.h, z7.h\n"
"fmla z29.h, p2/M, z0.h, z8.h\n"
"fmla z30.h, p2/M, z0.h, z14.h\n"
- "fmla z31.h, p2/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xc8]\n"
+ "fmla z31.h, p2/M, z0.h, z22.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z26.h, z8.h\n"
+ "fmla z29.h, p2/M, z26.h, z13.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x17, #0xc8]\n"
+ "fmla z30.h, p2/M, z26.h, z22.h\n"
+ "fmla z31.h, p2/M, z26.h, z19.h\n"
+ "ld1h { z9.h }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z23.h, z13.h\n"
+ "fmla z29.h, p2/M, z23.h, z17.h\n"
+ "ld1h { z6.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xc0]\n"
+ "fmla z30.h, p2/M, z23.h, z19.h\n"
+ "fmla z31.h, p2/M, z23.h, z18.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "ld1h { z26.h }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z5.h, z17.h\n"
+ "fmla z29.h, p2/M, z5.h, z24.h\n"
+ "ld1h { z4.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0xd0]\n"
+ "fmla z30.h, p2/M, z5.h, z18.h\n"
+ "fmla z31.h, p2/M, z5.h, z6.h\n"
+ "ld1h { z17.h }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z16.h, z24.h\n"
+ "fmla z29.h, p2/M, z16.h, z10.h\n"
+ "ld1h { z0.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x17, #0xd8]\n"
+ "fmla z30.h, p2/M, z16.h, z6.h\n"
+ "fmla z31.h, p2/M, z16.h, z1.h\n"
+ "ld1h { z25.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xe0]\n"
+ "ld1h { z16.h }, p2/Z, [x15, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z20.h, z14.h\n"
+ "fmla z29.h, p2/M, z20.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x26, [x17, #0xf8]\n"
+ "fmla z30.h, p2/M, z20.h, z4.h\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "ld1h { z23.h }, p2/Z, [x15, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z9.h, z22.h\n"
+ "fmla z29.h, p2/M, z9.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xe8]\n"
+ "fmla z30.h, p2/M, z9.h, z0.h\n"
+ "fmla z31.h, p2/M, z9.h, z25.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z21.h, z19.h\n"
+ "fmla z29.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z22.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xf0]\n"
+ "fmla z30.h, p2/M, z21.h, z25.h\n"
+ "fmla z31.h, p2/M, z21.h, z3.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z17.h, z18.h\n"
+ "fmla z29.h, p2/M, z17.h, z6.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x100]\n"
+ "fmla z30.h, p2/M, z17.h, z3.h\n"
+ "fmla z31.h, p2/M, z17.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x15, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z16.h, z6.h\n"
+ "fmla z29.h, p2/M, z16.h, z1.h\n"
+ "ld1h { z5.h }, p3/Z, [x28, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z30.h, p2/M, z16.h, z22.h\n"
+ "fmla z31.h, p2/M, z16.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x15, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z23.h, z4.h\n"
+ "fmla z29.h, p2/M, z23.h, z0.h\n"
+ "ld1h { z13.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x110]\n"
+ "fmla z30.h, p2/M, z23.h, z18.h\n"
+ "fmla z31.h, p2/M, z23.h, z5.h\n"
+ "ld1h { z9.h }, p2/Z, [x15, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z20.h, z0.h\n"
+ "fmla z29.h, p2/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x22, [x17, #0x118]\n"
+ "fmla z30.h, p2/M, z20.h, z5.h\n"
+ "fmla z31.h, p2/M, z20.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x15]\n"
+ "fmla z28.h, p2/M, z19.h, z25.h\n"
+ "fmla z29.h, p2/M, z19.h, z3.h\n"
+ "ld1h { z14.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z19.h, z13.h\n"
+ "fmla z31.h, p2/M, z19.h, z23.h\n"
+ "ld1h { z7.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z17.h, z3.h\n"
+ "fmla z29.h, p2/M, z17.h, z22.h\n"
+ "ld1h { z0.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z17.h, z23.h\n"
+ "fmla z31.h, p2/M, z17.h, z21.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z16.h, z22.h\n"
+ "fmla z29.h, p2/M, z16.h, z24.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z16.h, z21.h\n"
+ "fmla z31.h, p2/M, z16.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "fmla z28.h, p2/M, z9.h, z18.h\n"
+ "fmla z29.h, p2/M, z9.h, z5.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z9.h, z0.h\n"
+ "fmla z31.h, p2/M, z9.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x0]\n"
+ "fmla z28.h, p2/M, z1.h, z5.h\n"
"fmla z29.h, p2/M, z1.h, z13.h\n"
- "fmla z30.h, p2/M, z1.h, z11.h\n"
- "fmla z31.h, p2/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
- "fmla z28.h, p2/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xc0]\n"
- "fmla z29.h, p2/M, z2.h, z5.h\n"
- "fmla z30.h, p2/M, z2.h, z12.h\n"
- "fmla z31.h, p2/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
- "ld1h { z17.h }, p2/Z, [x14, #4, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0xd0]\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "fmla z30.h, p2/M, z3.h, z9.h\n"
- "fmla z31.h, p2/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0xd8]\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.h, p2/M, z4.h, z13.h\n"
- "fmla z31.h, p2/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z29.h, p2/M, z0.h, z11.h\n"
- "fmla z30.h, p2/M, z0.h, z5.h\n"
- "fmla z31.h, p2/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
- "ldr x22, [x16, #0xe8]\n"
- "fmla z29.h, p2/M, z1.h, z12.h\n"
- "fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
- "fmla z28.h, p2/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z29.h, p2/M, z2.h, z9.h\n"
- "fmla z30.h, p2/M, z2.h, z10.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0x100]\n"
- "fmla z29.h, p2/M, z3.h, z13.h\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0x108]\n"
- "fmla z29.h, p2/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0x110]\n"
- "fmla z29.h, p2/M, z0.h, z6.h\n"
- "fmla z30.h, p2/M, z0.h, z9.h\n"
- "fmla z31.h, p2/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0x118]\n"
- "fmla z29.h, p2/M, z1.h, z10.h\n"
- "fmla z30.h, p2/M, z1.h, z13.h\n"
- "fmla z31.h, p2/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p2/Z, [x14]\n"
- "fmla z28.h, p2/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "fmla z30.h, p2/M, z2.h, z5.h\n"
- "fmla z31.h, p2/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "fmla z30.h, p2/M, z3.h, z6.h\n"
- "fmla z31.h, p2/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z4.h, z14.h\n"
- "fmla z30.h, p2/M, z4.h, z8.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "fmla z28.h, p2/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z0.h, z13.h\n"
- "fmla z30.h, p2/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
- "ldp x27, x26, [x16, #0x0]\n"
- "fmla z31.h, p2/M, z0.h, z12.h\n"
- "fmla z28.h, p2/M, z1.h, z13.h\n"
- "fmla z29.h, p2/M, z1.h, z5.h\n"
- "fmla z30.h, p2/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
- "fmla z31.h, p2/M, z1.h, z9.h\n"
- "fmla z28.h, p2/M, z2.h, z5.h\n"
- "ld1h { z5.h }, p1/Z, [x27, x13, LSL #1]\n"
- "fmla z29.h, p2/M, z2.h, z6.h\n"
- "fmla z30.h, p2/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldp x25, x24, [x16, #0x10]\n"
- "ldp x23, x22, [x16, #0x20]\n"
- "inch x15\n"
- "ldp x21, x20, [x16, #0x30]\n"
- "whilelt p3.h, x15, %x[n_channels]\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "fmla z28.h, p2/M, z3.h, z6.h\n"
- "ld1h { z6.h }, p1/Z, [x26, x13, LSL #1]\n"
- "ldp x27, x26, [x16, #0x40]\n"
- "fmla z29.h, p2/M, z3.h, z8.h\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "ld1h { z7.h }, p1/Z, [x25, x13, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "fmla z28.h, p2/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p1/Z, [x24, x13, LSL #1]\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n"
- "ld1h { z12.h }, p1/Z, [x20, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p1/Z, [x23, x13, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x27, x13, LSL #1]\n"
- "ld1h { z14.h }, p1/Z, [x26, x13, LSL #1]\n"
- "inch x13\n"
- ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
- "addvl x14, x14, #4\n"
- "cmp x13, %x[n_channels]\n"
- ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
- "ld1h { z4.h }, p2/Z, [x14]\n"
- "addvl x14, x14, #1\n"
- "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "fmla z30.h, p2/M, z1.h, z17.h\n"
+ "fmla z31.h, p2/M, z1.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z7.h, z13.h\n"
+ "fmla z29.h, p2/M, z7.h, z23.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "fmla z30.h, p2/M, z7.h, z16.h\n"
+ "fmla z31.h, p2/M, z7.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "inch x16\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "whilelt p3.h, x16, %x[n_channels]\n"
+ "fmla z28.h, p2/M, z20.h, z23.h\n"
+ "fmla z29.h, p2/M, z20.h, z21.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "fmla z30.h, p2/M, z20.h, z18.h\n"
+ "fmla z31.h, p2/M, z20.h, z17.h\n"
+ "ld1h { z7.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z28.h, p2/M, z19.h, z21.h\n"
+ "fmla z29.h, p2/M, z19.h, z14.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "fmla z30.h, p2/M, z19.h, z17.h\n"
+ "fmla z31.h, p2/M, z19.h, z16.h\n"
+ "ld1h { z9.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "inch x14\n"
+ ".inst 0xa040a1e0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x14, %x[n_channels]\n"
+ ".inst 0xc17bc9fc // fclamp { z28.h-z31.h }, z15.h, z27.h\n"
+ "ld1h { z4.h }, p2/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "st1h { z28.h }, p0, [x13, x11, LSL #1]\n"
+ "st1h { z29.h }, p0, [x12, x11, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x11, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
- "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
- "ldr x25, [x16, #0x50]\n"
- "inch x28\n"
- "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
- "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
- "ldr x24, [x16, #0x58]\n"
- "ld1h { z0.h }, p2/Z, [x14]\n"
- "ldr x23, [x16, #0x60]\n"
+ "movprfx z28, z26\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z26\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x22, [x17, #0x50]\n"
+ "inch x11\n"
+ "movprfx z30, z26\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z26\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x21, [x17, #0x58]\n"
+ "ld1h { z19.h }, p2/Z, [x15]\n"
+ "ldr x20, [x17, #0x60]\n"
"mov p0.b, p3.b\n"
- "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ldr x22, [x16, #0x68]\n"
+ "ld1h { z23.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x25, [x17, #0x68]\n"
"fmla z28.h, p2/M, z1.h, z6.h\n"
"fmla z29.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldr x21, [x16, #0x70]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x24, [x17, #0x70]\n"
"fmla z30.h, p2/M, z1.h, z8.h\n"
"fmla z31.h, p2/M, z1.h, z13.h\n"
- "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
- "ldr x20, [x16, #0x78]\n"
- "ldr x27, [x16, #0x80]\n"
+ "ld1h { z21.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "ldr x27, [x17, #0x80]\n"
+ "ldr x22, [x17, #0x88]\n"
"fmla z28.h, p2/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
"fmla z29.h, p2/M, z2.h, z11.h\n"
- "ldr x26, [x16, #0x88]\n"
+ "ld1h { z18.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x90]\n"
"fmla z30.h, p2/M, z2.h, z13.h\n"
- "fmla z31.h, p2/M, z2.h, z5.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
- "ldr x25, [x16, #0x90]\n"
- "ldr x24, [x16, #0x98]\n"
- "fmla z28.h, p2/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z3.h, z12.h\n"
- "ldr x23, [x16, #0xa0]\n"
- "fmla z30.h, p2/M, z3.h, z5.h\n"
- "fmla z31.h, p2/M, z3.h, z6.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
- "ldr x22, [x16, #0xa8]\n"
- "fmla z28.h, p2/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
- "ldr x21, [x16, #0xb0]\n"
- "fmla z29.h, p2/M, z4.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z30.h, p2/M, z4.h, z6.h\n"
- "fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z7.h\n"
- "fmla z29.h, p2/M, z0.h, z8.h\n"
- "fmla z30.h, p2/M, z0.h, z14.h\n"
- "fmla z31.h, p2/M, z0.h, z11.h\n"
- "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z8.h\n"
- "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0xc8]\n"
- "fmla z29.h, p2/M, z1.h, z13.h\n"
- "fmla z30.h, p2/M, z1.h, z11.h\n"
- "fmla z31.h, p2/M, z1.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
- "fmla z28.h, p2/M, z2.h, z13.h\n"
- "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0xc0]\n"
- "fmla z29.h, p2/M, z2.h, z5.h\n"
- "fmla z30.h, p2/M, z2.h, z12.h\n"
- "fmla z31.h, p2/M, z2.h, z9.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
- "fmla z28.h, p2/M, z3.h, z5.h\n"
- "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0xd0]\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "fmla z30.h, p2/M, z3.h, z9.h\n"
- "fmla z31.h, p2/M, z3.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z6.h\n"
- "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0xd8]\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
- "ldr x23, [x16, #0xe0]\n"
- "fmla z30.h, p2/M, z4.h, z13.h\n"
- "fmla z31.h, p2/M, z4.h, z8.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z14.h\n"
- "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
- "ldr x20, [x16, #0xf8]\n"
- "fmla z29.h, p2/M, z0.h, z11.h\n"
- "fmla z30.h, p2/M, z0.h, z5.h\n"
- "fmla z31.h, p2/M, z0.h, z6.h\n"
- "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
- "ldr x22, [x16, #0xe8]\n"
- "fmla z29.h, p2/M, z1.h, z12.h\n"
- "fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z10.h\n"
- "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
- "fmla z28.h, p2/M, z2.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
- "ldr x21, [x16, #0xf0]\n"
- "fmla z29.h, p2/M, z2.h, z9.h\n"
- "fmla z30.h, p2/M, z2.h, z10.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
- "fmla z28.h, p2/M, z3.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
- "ldr x27, [x16, #0x100]\n"
- "fmla z29.h, p2/M, z3.h, z13.h\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
- "fmla z28.h, p2/M, z4.h, z13.h\n"
- "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
- "ldr x26, [x16, #0x108]\n"
- "fmla z29.h, p2/M, z4.h, z8.h\n"
- "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z14.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z5.h\n"
- "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x16, #0x110]\n"
- "fmla z29.h, p2/M, z0.h, z6.h\n"
- "fmla z30.h, p2/M, z0.h, z9.h\n"
- "fmla z31.h, p2/M, z0.h, z13.h\n"
- "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
- "fmla z28.h, p2/M, z1.h, z6.h\n"
- "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
- "ldr x24, [x16, #0x118]\n"
- "fmla z29.h, p2/M, z1.h, z10.h\n"
- "fmla z30.h, p2/M, z1.h, z13.h\n"
- "fmla z31.h, p2/M, z1.h, z5.h\n"
- "ld1h { z1.h }, p2/Z, [x14]\n"
- "fmla z28.h, p2/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z2.h, z11.h\n"
- "fmla z30.h, p2/M, z2.h, z5.h\n"
- "fmla z31.h, p2/M, z2.h, z6.h\n"
- "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z31.h, p2/M, z2.h, z23.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "ldr x26, [x17, #0x98]\n"
+ "ldr x20, [x17, #0xa0]\n"
"fmla z28.h, p2/M, z3.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
"fmla z29.h, p2/M, z3.h, z12.h\n"
- "fmla z30.h, p2/M, z3.h, z6.h\n"
- "fmla z31.h, p2/M, z3.h, z8.h\n"
- "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z0.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z30.h, p2/M, z3.h, z23.h\n"
+ "fmla z31.h, p2/M, z3.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x15, #3, MUL VL]\n"
"fmla z28.h, p2/M, z4.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z4.h, z14.h\n"
- "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z29.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z5.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z30.h, p2/M, z4.h, z22.h\n"
"fmla z31.h, p2/M, z4.h, z10.h\n"
- "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
- "fmla z28.h, p2/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
- "fmla z29.h, p2/M, z0.h, z13.h\n"
- "fmla z30.h, p2/M, z0.h, z11.h\n"
- "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
- "fmla z31.h, p2/M, z0.h, z12.h\n"
- "fmla z28.h, p2/M, z1.h, z13.h\n"
- "fmla z29.h, p2/M, z1.h, z5.h\n"
- "fmla z30.h, p2/M, z1.h, z12.h\n"
- "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
- "fmla z31.h, p2/M, z1.h, z9.h\n"
- "fmla z28.h, p2/M, z2.h, z5.h\n"
- "fmla z29.h, p2/M, z2.h, z6.h\n"
- "fmla z30.h, p2/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "fmla z28.h, p2/M, z3.h, z6.h\n"
- "fmla z29.h, p2/M, z3.h, z8.h\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "fmla z28.h, p2/M, z4.h, z8.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z9.h\n"
- ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
- "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "ld1h { z3.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "ld1h { z20.h }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z19.h, z7.h\n"
+ "fmla z29.h, p2/M, z19.h, z8.h\n"
+ "fmla z30.h, p2/M, z19.h, z14.h\n"
+ "fmla z31.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z21.h, z8.h\n"
+ "fmla z29.h, p2/M, z21.h, z13.h\n"
+ "ld1h { z26.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x28, [x17, #0xc8]\n"
+ "fmla z30.h, p2/M, z21.h, z0.h\n"
+ "fmla z31.h, p2/M, z21.h, z5.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z16.h, z13.h\n"
+ "fmla z29.h, p2/M, z16.h, z23.h\n"
+ "ld1h { z25.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z30.h, p2/M, z16.h, z5.h\n"
+ "fmla z31.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z28.h, p2/M, z17.h, z23.h\n"
+ "fmla z29.h, p2/M, z17.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z30.h, p2/M, z17.h, z3.h\n"
+ "fmla z31.h, p2/M, z17.h, z25.h\n"
+ "ld1h { z17.h }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z20.h, z22.h\n"
+ "fmla z29.h, p2/M, z20.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x17, #0xd8]\n"
+ "fmla z30.h, p2/M, z20.h, z25.h\n"
+ "fmla z31.h, p2/M, z20.h, z26.h\n"
+ "ld1h { z2.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xe0]\n"
+ "ld1h { z22.h }, p2/Z, [x15, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z19.h, z14.h\n"
+ "fmla z29.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z1.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x26, [x17, #0xf8]\n"
+ "fmla z30.h, p2/M, z19.h, z24.h\n"
+ "fmla z31.h, p2/M, z19.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z18.h, z0.h\n"
+ "fmla z29.h, p2/M, z18.h, z5.h\n"
+ "ld1h { z0.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xe8]\n"
+ "fmla z30.h, p2/M, z18.h, z23.h\n"
+ "fmla z31.h, p2/M, z18.h, z2.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z16.h, z5.h\n"
+ "fmla z29.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xf0]\n"
+ "fmla z30.h, p2/M, z16.h, z2.h\n"
+ "fmla z31.h, p2/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z17.h, z3.h\n"
+ "fmla z29.h, p2/M, z17.h, z25.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z30.h, p2/M, z17.h, z0.h\n"
+ "fmla z31.h, p2/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x15, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z22.h, z25.h\n"
+ "fmla z29.h, p2/M, z22.h, z26.h\n"
+ "ld1h { z7.h }, p3/Z, [x28, x16, LSL #1]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z30.h, p2/M, z22.h, z19.h\n"
+ "fmla z31.h, p2/M, z22.h, z1.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ld1h { z4.h }, p2/Z, [x15, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z21.h, z24.h\n"
+ "fmla z29.h, p2/M, z21.h, z23.h\n"
+ "ld1h { z26.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z30.h, p2/M, z21.h, z16.h\n"
+ "fmla z31.h, p2/M, z21.h, z7.h\n"
+ "ld1h { z25.h }, p2/Z, [x15, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z20.h, z23.h\n"
+ "fmla z29.h, p2/M, z20.h, z2.h\n"
+ "ld1h { z24.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z30.h, p2/M, z20.h, z7.h\n"
+ "fmla z31.h, p2/M, z20.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x15]\n"
+ "fmla z28.h, p2/M, z18.h, z2.h\n"
+ "fmla z29.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z18.h, z26.h\n"
+ "fmla z31.h, p2/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z17.h, z0.h\n"
+ "fmla z29.h, p2/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z17.h, z24.h\n"
+ "fmla z31.h, p2/M, z17.h, z9.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z19.h\n"
+ "fmla z29.h, p2/M, z4.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z9.h\n"
+ "fmla z31.h, p2/M, z4.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z28.h, p2/M, z25.h, z16.h\n"
+ "fmla z29.h, p2/M, z25.h, z7.h\n"
+ "ld1h { z16.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p2/M, z25.h, z18.h\n"
+ "fmla z31.h, p2/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z23.h, z7.h\n"
+ "fmla z29.h, p2/M, z23.h, z26.h\n"
+ "fmla z30.h, p2/M, z23.h, z17.h\n"
+ "fmla z31.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z21.h, z26.h\n"
+ "fmla z29.h, p2/M, z21.h, z24.h\n"
+ "fmla z30.h, p2/M, z21.h, z16.h\n"
+ "fmla z31.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z20.h, z24.h\n"
+ "fmla z29.h, p2/M, z20.h, z9.h\n"
+ "fmla z30.h, p2/M, z20.h, z18.h\n"
+ "fmla z31.h, p2/M, z20.h, z17.h\n"
+ "fmla z28.h, p2/M, z19.h, z9.h\n"
+ "fmla z29.h, p2/M, z19.h, z22.h\n"
+ "fmla z30.h, p2/M, z19.h, z17.h\n"
+ "fmla z31.h, p2/M, z19.h, z16.h\n"
+ ".inst 0xc17bc9fc // fclamp { z28.h-z31.h }, z15.h, z27.h\n"
+ "st1h { z28.h }, p0, [x13, x11, LSL #1]\n"
+ "st1h { z29.h }, p0, [x12, x11, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x11, LSL #1]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME2) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 96cfd5e497..e0c7d71e61 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,240 +88,240 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ptrue p3.b\n"
- ".inst 0x25207810 // ptrue pn8.b\n"
"mov x4, #0x0\n"
"mov x5, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
"1:" // Tile loop
"str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x22, #0x2\n"
"str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
"ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+ "add x17, x6, x6\n"
"mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x16, x17, x6\n"
"add x7, x7, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x8, x7, x21, LSL #2\n"
- "add x17, x8, x21, LSL #2\n"
- "add x16, x6, x6\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x14, x17, x21, LSL #2\n"
- "add x13, x16, x6\n"
+ "add x15, x7, x21, LSL #2\n"
+ "add x14, x15, x21, LSL #2\n"
+ "add x13, x14, x21, LSL #2\n"
"cbnz x5, 2f\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x21, x20, x5\n"
- "sub x21, x21, #0x1\n"
"lsl x12, %x[n_channels], #0x2\n"
- "mov x20, #0x8\n"
- "and x21, x21, #0x3fffff\n"
- "mul x20, x20, x6\n"
- "orr x12, x12, x21, LSL #22\n"
- "orr x12, x12, x20, LSL #38\n"
- "add x11, x8, x6, LSL #2\n"
- "add x10, x7, x13, LSL #2\n"
- "add x9, x8, x16, LSL #2\n"
- "add x28, x17, x6, LSL #2\n"
- "add x27, x14, x13, LSL #2\n"
+ "mov x21, #0x8\n"
+ "mul x21, x21, x6\n"
+ "add x11, x15, x6, LSL #2\n"
+ "add x10, x7, x16, LSL #2\n"
+ "add x9, x15, x17, LSL #2\n"
+ "sub x20, x20, x5\n"
+ "add x28, x14, x6, LSL #2\n"
+ "sub x20, x20, #0x1\n"
+ "add x27, x13, x16, LSL #2\n"
+ "and x20, x20, #0x3fffff\n"
"add x26, x7, x6, LSL #2\n"
- "add x25, x7, x16, LSL #2\n"
- "add x24, x17, x16, LSL #2\n"
- "add x23, x8, x13, LSL #2\n"
- "add x22, x17, x13, LSL #2\n"
- "add x21, x14, x6, LSL #2\n"
- "add x20, x14, x16, LSL #2\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x25, x7, x17, LSL #2\n"
+ "orr x12, x12, x21, LSL #38\n"
+ "add x24, x14, x17, LSL #2\n"
+ "add x23, x15, x16, LSL #2\n"
+ "add x22, x14, x16, LSL #2\n"
+ "add x21, x13, x6, LSL #2\n"
+ "add x20, x13, x17, LSL #2\n"
".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
- ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
- ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac49fa // rprfm pldonce, x12, [x15]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
- ".inst 0xf8ac4a3a // rprfm pldonce, x12, [x17]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x4, x22\n" // offset = tile_i * ld_output_row
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mov x20, #0x2\n"
- "ld1w { z22.s }, p3/Z, [x15]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
- "addvl x15, x15, #1\n"
- ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "ld1w { z22.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x25\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
"ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mul x21, x21, x20\n" // offset *= output_tile_size
- "cntw x23\n"
- "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "addvl x15, x15, #4\n"
- "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "addvl x15, x15, #4\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cmp x23, %x[n_channels]\n"
- "add x22, x24, x22, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x15]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mul x22, x4, x23\n" // offset = tile_i * ld_output_row
+ "cmp x25, %x[n_channels]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x5, x26, x22\n" // offset += tile_j * ld_output_col
+ "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "ld1w { z9.s }, p2/Z, [x8, x6, LSL #2]\n"
+ "mul x22, x22, x20\n" // offset *= output_tile_size
+ "sub x20, XZR, x25\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "add x24, x24, x22, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z9.s }, p2/Z, [x15, x6, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "add x23, x24, x23, LSL #2\n"
"ld1w { z10.s }, p2/Z, [x7]\n"
- "addvl x15, x15, #1\n"
- "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x8, x16, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x15, x17, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x14, x6, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "whilelt p1.s, x23, %x[n_channels]\n"
- "incw x21\n"
- "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z18.s }, p2/Z, [x14]\n"
- "incw x23\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x14, x13, LSL #2]\n"
- "mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x17, x16, LSL #2]\n"
- "incw x20\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x7, x6, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z18.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x16, LSL #2]\n"
- "addvl x7, x7, #1\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
- "ld1w { z22.s }, p3/Z, [x15]\n"
- "addvl x15, x15, #1\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z17.s\n"
- "ld1w { z9.s }, p2/Z, [x8]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z0.s, z11.s\n"
- "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
- "addvl x8, x8, #1\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
- "fmla z28.s, p3/M, z2.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
- "ld1w { z18.s }, p2/Z, [x17]\n"
- "fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z19.s\n"
- "fmla z28.s, p3/M, z8.s, z16.s\n"
- "fmla z29.s, p3/M, z7.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
- "addvl x17, x17, #1\n"
- "fmla z30.s, p3/M, z3.s, z18.s\n"
- "fmla z31.s, p3/M, z5.s, z17.s\n"
- "ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z9.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z19.s\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "fmla z29.s, p3/M, z8.s, z17.s\n"
- ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
- "addvl x15, x15, #4\n"
- "fmla z30.s, p3/M, z8.s, z16.s\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- "addvl x15, x15, #4\n"
- "cmp x23, %x[n_channels]\n"
- ".inst 0xc1aecabc // fclamp { z28.s-z31.s }, z21.s, z14.s\n"
- "addvl x14, x14, #1\n"
- "ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x7]\n"
- "st1w { z28.s }, p0, [x24]\n"
- "ld1w { z11.s }, p1/Z, [x7, x13, LSL #2]\n"
- "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
- "addvl x24, x24, #1\n"
- "ld1w { z12.s }, p1/Z, [x8, x16, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
- "addvl x22, x22, #1\n"
- "ld1w { z8.s }, p3/Z, [x15]\n"
- "addvl x15, x15, #1\n"
- "blt 3b\n"
- "4:" // Tile loop: Channel tail
"movprfx z24, z22\n fmla z24.s, p3/M, z4.s, z9.s\n"
"movprfx z25, z22\n fmla z25.s, p3/M, z3.s, z9.s\n"
- "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x5, x5, #0x1\n"
+ "whilelt p1.s, x25, %x[n_channels]\n"
+ "incw x21\n"
"movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
"movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z9.s\n"
- "ld1w { z17.s }, p2/Z, [x14]\n"
- "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ld1w { z17.s }, p2/Z, [x13]\n"
+ "incw x25\n"
+ "ld1w { z22.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "mov p0.b, p2.b\n"
+ "incw x20\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ld1w { z16.s }, p2/Z, [x13, x16, LSL #2]\n"
"fmla z26.s, p3/M, z2.s, z12.s\n"
"fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z20.s }, p2/Z, [x17, x16, LSL #2]\n"
- "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "ld1w { z18.s }, p2/Z, [x14, x17, LSL #2]\n"
"fmla z24.s, p3/M, z5.s, z12.s\n"
"fmla z25.s, p3/M, z4.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x7, x6, LSL #2]\n"
- "cmp x5, x20\n"
+ "ld1w { z28.s }, p2/Z, [x7, x6, LSL #2]\n"
"fmla z26.s, p3/M, z6.s, z17.s\n"
"fmla z27.s, p3/M, z3.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
- "add x20, x4, #0x1\n"
+ "ld1w { z14.s }, p2/Z, [x7, x17, LSL #2]\n"
+ "addvl x7, x7, #1\n"
"fmla z24.s, p3/M, z7.s, z13.s\n"
"fmla z25.s, p3/M, z6.s, z13.s\n"
- "csel x4, x4, x20, LT\n"
- "mov p0.b, p2.b\n"
"fmla z26.s, p3/M, z4.s, z13.s\n"
"fmla z27.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x8]\n"
- "csel x5, x5, XZR, LT\n"
- "fmla z24.s, p3/M, z1.s, z18.s\n"
- "fmla z25.s, p3/M, z0.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
- "cmp x4, x21\n"
- "fmla z26.s, p3/M, z5.s, z20.s\n"
- "fmla z27.s, p3/M, z4.s, z20.s\n"
- "fmla z24.s, p3/M, z2.s, z17.s\n"
- "fmla z25.s, p3/M, z1.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x17]\n"
- "fmla z26.s, p3/M, z0.s, z16.s\n"
- "fmla z27.s, p3/M, z2.s, z19.s\n"
- "fmla z24.s, p3/M, z8.s, z20.s\n"
- "fmla z25.s, p3/M, z7.s, z20.s\n"
- "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z18.s\n"
- "fmla z27.s, p3/M, z5.s, z17.s\n"
- "fmla z24.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
- "fmla z25.s, p3/M, z5.s, z19.s\n"
- "fmla z26.s, p3/M, z7.s, z16.s\n"
- "fmla z27.s, p3/M, z6.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
- "fmla z24.s, p3/M, z6.s, z18.s\n"
- "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x15]\n"
+ "fmla z24.s, p3/M, z1.s, z28.s\n"
+ "fmla z25.s, p3/M, z0.s, z28.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, x16, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z14.s\n"
+ "fmla z25.s, p3/M, z1.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z0.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z8.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z13.s }, p1/Z, [x14, x6, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x6, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "addvl x13, x13, #1\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z27.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p1/Z, [x7, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x15, x6, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x7]\n"
"fmla z26.s, p3/M, z8.s, z16.s\n"
"fmla z27.s, p3/M, z7.s, z16.s\n"
- ".inst 0xc1aecab8 // fclamp { z24.s-z27.s }, z21.s, z14.s\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ld1w { z12.s }, p1/Z, [x15, x17, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ ".inst 0xc1b5c9f8 // fclamp { z24.s-z27.s }, z15.s, z21.s\n"
"st1w { z24.s }, p0, [x24]\n"
- "st1w { z25.s }, p0, [x24, x25, LSL #2]\n"
- "st1w { z26.s }, p0, [x22]\n"
- "st1w { z27.s }, p0, [x22, x25, LSL #2]\n"
+ "st1w { z25.s }, p0, [x24, x26, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z26.s }, p0, [x23]\n"
+ "st1w { z27.s }, p0, [x23, x26, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x13]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x5, x5, #0x1\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "add x20, x4, #0x1\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "cmp x5, x22\n"
+ "csel x4, x4, x20, LT\n"
+ "csel x5, x5, XZR, LT\n"
+ "cmp x4, x21\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x7, x6, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x7, x17, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x15]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "fmla z31.s, p3/M, z4.s, z20.s\n"
+ "fmla z28.s, p3/M, z2.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x14]\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z20.s\n"
+ "fmla z29.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z19.s\n"
+ "fmla z31.s, p3/M, z5.s, z18.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x6, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1b5c9fc // fclamp { z28.s-z31.s }, z15.s, z21.s\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x24, x26, LSL #2]\n"
+ "st1w { z30.s }, p0, [x23]\n"
+ "st1w { z31.s }, p0, [x23, x26, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 39f1b3635f..7ad83b779b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,194 +80,194 @@ void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ptrue p3.b\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
"ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z23.s }, p3/Z, [x14]\n"
+ "ldr x24, [x16, #0x20]\n"
+ "cntw x13\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x10, XZR, x13\n"
+ "ldp x9, x28, [x20, #0x10]\n"
+ "ld1w { z20.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- "ldp x13, x12, [x20, #0x0]\n"
- "cntw x11\n"
+ "ldp x23, x22, [x16, #0x0]\n"
".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "mov x28, #0x0\n"
- "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
- "ldp x24, x23, [x15, #0x0]\n"
"addvl x14, x14, #4\n"
- "cmp x11, %x[n_channels]\n"
- "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x27, XZR, x11\n"
- "ldr x20, [x15, #0x20]\n"
"ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
- "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x24, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x20, [x15, #0x28]\n"
- "whilelt p1.s, x11, %x[n_channels]\n"
- "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x20, [x15, #0x30]\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x21, [x15, #0x38]\n"
- "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x20, [x15, #0x48]\n"
- "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z12.s\n"
- "fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x20, [x15, #0x40]\n"
- "fmla z30.s, p3/M, z6.s, z19.s\n"
- "fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x21, [x15, #0x50]\n"
- "fmla z28.s, p3/M, z7.s, z13.s\n"
- "fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x20, [x15, #0x58]\n"
- "ld1w { z23.s }, p3/Z, [x14]\n"
- "fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z18.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x21, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z1.s, z16.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x20, [x15, #0x68]\n"
- "fmla z30.s, p3/M, z5.s, z17.s\n"
- "fmla z31.s, p3/M, z4.s, z17.s\n"
- "ldr x26, [x15, #0x70]\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x16, #0x28]\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z1.s, z9.s\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ld1w { z20.s }, p3/Z, [x14]\n"
+ "ldr x24, [x16, #0x38]\n"
"addvl x14, x14, #1\n"
- "fmla z28.s, p3/M, z2.s, z25.s\n"
- "fmla z29.s, p3/M, z1.s, z25.s\n"
- "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x25, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z2.s, z19.s\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "incw x27\n"
- "fmla z28.s, p3/M, z8.s, z17.s\n"
- "fmla z29.s, p3/M, z7.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "fmla z30.s, p3/M, z3.s, z18.s\n"
- "fmla z31.s, p3/M, z5.s, z17.s\n"
- "ldr x20, [x15, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "incw x10\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x48]\n"
"mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
- "incw x28\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "fmla z29.s, p3/M, z8.s, z17.s\n"
- "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
- "whilelt p2.s, x28, %x[n_channels]\n"
- "fmla z30.s, p3/M, z8.s, z16.s\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
- ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
- "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
- "incw x11\n"
- "cmp x11, %x[n_channels]\n"
- "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x16, #0x40]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ldr x22, [x16, #0x50]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "ldr x20, [x16, #0x60]\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "fmla z25.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z28.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x27, [x16, #0x68]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z14.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x78]\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z4.s, z13.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z28.s\n"
+ "fmla z25.s, p3/M, z0.s, z28.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z14.s\n"
+ "fmla z25.s, p3/M, z1.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "fmla z26.s, p3/M, z0.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z13.s }, p1/Z, [x20, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "incw x15\n"
".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z27.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p1/Z, [x22, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x23, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z12.s }, p1/Z, [x21, x13, LSL #2]\n"
+ "incw x13\n"
".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
"addvl x14, x14, #4\n"
- "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ "cmp x13, %x[n_channels]\n"
"ld1w { z8.s }, p3/Z, [x14]\n"
"addvl x14, x14, #1\n"
+ ".inst 0xc1afcad8 // fclamp { z24.s-z27.s }, z22.s, z15.s\n"
+ "st1w { z24.s }, p0, [x12, x10, LSL #2]\n"
+ "st1w { z25.s }, p0, [x11, x10, LSL #2]\n"
+ "st1w { z26.s }, p0, [x9, x10, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x10, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
- "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
- "ldr x20, [x15, #0x28]\n"
- "incw x27\n"
- "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x20, [x15, #0x30]\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x21, [x16, #0x28]\n"
+ "incw x10\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "ldr x22, [x16, #0x38]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x16, #0x48]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ldr x21, [x15, #0x38]\n"
- "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ldr x20, [x15, #0x48]\n"
- "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ldr x23, [x16, #0x60]\n"
"fmla z28.s, p3/M, z5.s, z12.s\n"
"fmla z29.s, p3/M, z4.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x20, [x15, #0x40]\n"
- "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
"fmla z31.s, p3/M, z3.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x20, [x15, #0x50]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x78]\n"
"fmla z28.s, p3/M, z7.s, z13.s\n"
"fmla z29.s, p3/M, z6.s, z13.s\n"
- "ldr x21, [x15, #0x58]\n"
- "mov p0.b, p2.b\n"
"fmla z30.s, p3/M, z4.s, z13.s\n"
- "fmla z31.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x20, [x15, #0x60]\n"
- "fmla z28.s, p3/M, z1.s, z18.s\n"
- "fmla z29.s, p3/M, z0.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
- "ldr x22, [x15, #0x68]\n"
+ "fmla z31.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
"fmla z30.s, p3/M, z5.s, z20.s\n"
"fmla z31.s, p3/M, z4.s, z20.s\n"
- "ldr x21, [x15, #0x70]\n"
- "fmla z28.s, p3/M, z2.s, z17.s\n"
- "fmla z29.s, p3/M, z1.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ldr x20, [x15, #0x78]\n"
- "fmla z30.s, p3/M, z0.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z2.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
"fmla z28.s, p3/M, z8.s, z20.s\n"
"fmla z29.s, p3/M, z7.s, z20.s\n"
- "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z18.s\n"
- "fmla z31.s, p3/M, z5.s, z17.s\n"
- "fmla z28.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
- "fmla z29.s, p3/M, z5.s, z19.s\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z31.s, p3/M, z6.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z19.s\n"
+ "fmla z31.s, p3/M, z5.s, z18.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
"fmla z30.s, p3/M, z8.s, z16.s\n"
"fmla z31.s, p3/M, z7.s, z16.s\n"
".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
- "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
- "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
- "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
- "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ "st1w { z28.s }, p0, [x12, x10, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x10, LSL #2]\n"
+ "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x10, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index d15a3a8377..cbb8d893d5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,72 +88,72 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ptrue p3.b\n"
- ".inst 0x25207810 // ptrue pn8.b\n"
"mov x2, #0x0\n"
"mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
"1:" // Tile loop
"str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x22, #0x3\n"
"str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "add x7, x4, x4\n"
"mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x8, x7, x4\n"
"add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x6, x5, x21, LSL #2\n"
- "add x7, x6, x21, LSL #2\n"
- "add x8, x4, x4\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, x7, x21, LSL #2\n"
- "add x15, x8, x4\n"
- "add x14, x16, x21, LSL #2\n"
- "add x13, x15, x4\n"
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #2\n"
+ "add x15, x16, x21, LSL #2\n"
+ "add x14, x15, x21, LSL #2\n"
+ "add x13, x14, x21, LSL #2\n"
"cbnz x3, 2f\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x21, x20, x3\n"
- "sub x21, x21, #0x1\n"
"lsl x12, %x[n_channels], #0x2\n"
- "mov x20, #0xc\n"
- "and x21, x21, #0x3fffff\n"
- "mul x20, x20, x4\n"
- "orr x12, x12, x21, LSL #22\n"
- "orr x12, x12, x20, LSL #38\n"
- "add x27, x7, x8, LSL #2\n"
- "add x26, x5, x13, LSL #2\n"
- "add x25, x6, x8, LSL #2\n"
- "add x24, x14, x13, LSL #2\n"
- "add x23, x7, x4, LSL #2\n"
+ "mov x28, #0xc\n"
+ "mul x28, x28, x4\n"
+ "add x27, x15, x7, LSL #2\n"
+ "add x26, x5, x17, LSL #2\n"
+ "add x25, x16, x7, LSL #2\n"
+ "sub x20, x20, x3\n"
+ "add x24, x13, x17, LSL #2\n"
+ "sub x20, x20, #0x1\n"
+ "add x23, x15, x4, LSL #2\n"
+ "and x20, x20, #0x3fffff\n"
"add x22, x5, x4, LSL #2\n"
- "add x21, x5, x15, LSL #2\n"
- "add x20, x7, x15, LSL #2\n"
- "add x11, x6, x13, LSL #2\n"
- "add x10, x16, x8, LSL #2\n"
- "add x9, x16, x13, LSL #2\n"
- "add x28, x14, x4, LSL #2\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x21, x5, x8, LSL #2\n"
+ "orr x12, x12, x28, LSL #38\n"
+ "add x20, x15, x8, LSL #2\n"
+ "add x11, x16, x17, LSL #2\n"
+ "add x10, x14, x7, LSL #2\n"
+ "add x9, x14, x17, LSL #2\n"
+ "add x28, x13, x4, LSL #2\n"
".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
- "add x27, x6, x4, LSL #2\n"
+ "add x27, x16, x4, LSL #2\n"
".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
- "add x26, x6, x15, LSL #2\n"
- ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ "add x26, x16, x8, LSL #2\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
- "add x25, x14, x15, LSL #2\n"
+ "add x25, x13, x8, LSL #2\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
- "add x24, x16, x4, LSL #2\n"
+ "add x24, x14, x4, LSL #2\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
- "add x23, x5, x8, LSL #2\n"
+ "add x23, x5, x7, LSL #2\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
- "add x22, x16, x15, LSL #2\n"
+ "add x22, x14, x8, LSL #2\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
- "add x21, x7, x13, LSL #2\n"
+ "add x21, x15, x17, LSL #2\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
- "add x20, x14, x8, LSL #2\n"
- ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
- ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ "add x20, x13, x7, LSL #2\n"
".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
@@ -163,312 +163,312 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
- ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac49fa // rprfm pldonce, x12, [x15]\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
- "mov x20, #0x3\n"
- "ld1w { z24.s }, p3/Z, [x17]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x21, #0x3\n"
+ "ld1w { z25.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
"ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
- "mul x21, x21, x20\n" // offset *= output_tile_size
- "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntw x22\n"
+ ".inst 0xa040c0c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
"ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "addvl x17, x17, #1\n"
- "add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "cntw x25\n"
- "addvl x17, x17, #4\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "add x24, x26, x22, LSL #2\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c0c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x20, x2, x23\n" // offset = tile_i * ld_output_row
+ "cmp x22, %x[n_channels]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x3, x27, x20\n" // offset += tile_j * ld_output_col
+ "add x25, x27, x27\n"
"ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "addvl x17, x17, #4\n"
- "cmp x25, %x[n_channels]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "add x23, x24, x22, LSL #2\n"
- "add x22, x27, x27\n"
- "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "mul x20, x20, x21\n" // offset *= output_tile_size
"mov x21, #0x0\n"
- "sub x20, XZR, x25\n"
+ "ld1w { z8.s }, p3/Z, [x6]\n"
+ "add x26, x26, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "sub x20, XZR, x22\n"
+ "ld1w { z9.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "add x24, x26, x23, LSL #2\n"
"ld1w { z10.s }, p2/Z, [x5]\n"
- "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
- "addvl x17, x17, #1\n"
- "ld1w { z12.s }, p2/Z, [x14]\n"
- "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "add x23, x24, x23, LSL #2\n"
+ "ld1w { z11.s }, p2/Z, [x5, x17, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x13]\n"
+ "ld1w { z13.s }, p2/Z, [x16, x7, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
- "movprfx z27, z24\n fmla z27.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x25, %x[n_channels]\n"
+ "movprfx z28, z25\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z23, z25\n fmla z23.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x22, %x[n_channels]\n"
"incw x21\n"
- "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "incw x25\n"
+ "movprfx z29, z25\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z30, z25\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "incw x22\n"
"mov p0.b, p2.b\n"
- "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
- "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z25\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z25\n fmla z16.s, p3/M, z3.s, z9.s\n"
"incw x20\n"
- "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
- "fmla z27.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "movprfx z17, z25\n fmla z17.s, p3/M, z2.s, z9.s\n"
+ "movprfx z19, z25\n fmla z19.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, x8, LSL #2]\n"
"fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z19.s }, p2/Z, [x7, x4, LSL #2]\n"
- "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
- "fmla z28.s, p3/M, z6.s, z19.s\n"
- "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
- "fmla z27.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, x4, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z13.s\n"
"fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z20.s, p3/M, z0.s, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x5, x4, LSL #2]\n"
- "fmla z21.s, p3/M, z6.s, z12.s\n"
- "ld1w { z15.s }, p2/Z, [x14, x13, LSL #2]\n"
- "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z18.s\n"
- "fmla z23.s, p3/M, z8.s, z15.s\n"
- "fmla z27.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z0.s, z19.s\n"
- "fmla z30.s, p3/M, z4.s, z19.s\n"
- "ld1w { z24.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "fmla z31.s, p3/M, z3.s, z19.s\n"
- "fmla z21.s, p3/M, z1.s, z19.s\n"
- "ld1w { z17.s }, p2/Z, [x6]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x16]\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z1.s, z18.s\n"
- "ld1w { z9.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z22.s, p3/M, z2.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z10.s\n"
- "fmla z29.s, p3/M, z7.s, z10.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z0.s, z17.s\n"
- "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z20.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z3.s, z16.s\n"
- "fmla z22.s, p3/M, z4.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z19.s\n"
- "fmla z27.s, p3/M, z3.s, z17.s\n"
- "fmla z29.s, p3/M, z5.s, z9.s\n"
- "ld1w { z17.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z16.s\n"
- "fmla z31.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z20.s, p3/M, z6.s, z19.s\n"
- "fmla z21.s, p3/M, z5.s, z19.s\n"
- "ld1w { z18.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z17.s\n"
- "fmla z22.s, p3/M, z6.s, z16.s\n"
- "fmla z30.s, p3/M, z8.s, z19.s\n"
- "fmla z20.s, p3/M, z8.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z0.s, z13.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "movprfx z18, z25\n fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z20.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "ld1w { z25.s }, p3/Z, [x6]\n"
"addvl x6, x6, #1\n"
- "fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmla z28.s, p3/M, z3.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z0.s, z18.s\n"
- "fmla z27.s, p3/M, z4.s, z18.s\n"
- "fmla z22.s, p3/M, z8.s, z16.s\n"
- "fmla z23.s, p3/M, z7.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z18.s\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z27.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z20.s\n"
+ "fmla z19.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z24.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z20.s\n"
+ "fmla z18.s, p3/M, z0.s, z20.s\n"
+ "fmla z17.s, p3/M, z1.s, z20.s\n"
+ "fmla z28.s, p3/M, z0.s, z27.s\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z21.s }, p2/Z, [x16]\n"
+ "fmla z29.s, p3/M, z1.s, z24.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "fmla z28.s, p3/M, z2.s, z24.s\n"
+ "fmla z23.s, p3/M, z1.s, z27.s\n"
+ "ld1w { z13.s }, p2/Z, [x16, x17, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x14]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z16.s, p3/M, z2.s, z13.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z27.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z21.s\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z20.s\n"
+ "ld1w { z20.s }, p2/Z, [x13, x4, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z27.s\n"
+ "fmla z19.s, p3/M, z3.s, z27.s\n"
+ "ld1w { z21.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z27.s\n"
+ "fmla z16.s, p3/M, z6.s, z27.s\n"
+ "fmla z17.s, p3/M, z5.s, z27.s\n"
+ "fmla z30.s, p3/M, z8.s, z27.s\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "fmla z19.s, p3/M, z5.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z20.s\n"
+ "fmla z16.s, p3/M, z8.s, z22.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
"addvl x16, x16, #1\n"
- "fmla z29.s, p3/M, z4.s, z17.s\n"
- "fmla z31.s, p3/M, z2.s, z17.s\n"
- "fmla z20.s, p3/M, z1.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z20.s }, p2/Z, [x13, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z4.s, z21.s\n"
+ "fmla z30.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z8.s, z20.s\n"
+ "fmla z19.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x5, x7, LSL #2]\n"
"addvl x5, x5, #1\n"
- "fmla z22.s, p3/M, z3.s, z11.s\n"
- "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z17.s, p3/M, z4.s, z21.s\n"
+ "fmla z30.s, p3/M, z7.s, z21.s\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
- "fmla z23.s, p3/M, z4.s, z19.s\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z1.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x7]\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z7.s, z19.s\n"
- "addvl x7, x7, #1\n"
- "fmla z22.s, p3/M, z5.s, z19.s\n"
- "fmla z27.s, p3/M, z6.s, z18.s\n"
- "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "fmla z21.s, p3/M, z0.s, z18.s\n"
- "fmla z23.s, p3/M, z2.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z18.s, p3/M, z3.s, z21.s\n"
+ "fmla z23.s, p3/M, z2.s, z20.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z21.s\n"
+ "ld1w { z11.s }, p2/Z, [x15]\n"
+ "fmla z28.s, p3/M, z1.s, z20.s\n"
+ "fmla z29.s, p3/M, z0.s, z20.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, x17, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x15, x7, LSL #2]\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z2.s, z20.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
"whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z29.s, p3/M, z8.s, z17.s\n"
- "fmla z20.s, p3/M, z5.s, z17.s\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "fmla z22.s, p3/M, z7.s, z16.s\n"
- "addvl x14, x14, #1\n"
- "cmp x25, %x[n_channels]\n"
- "fmla z23.s, p3/M, z6.s, z16.s\n"
- "fmax z27.s, p3/M, z27.s, z26.s\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "fmin z27.s, p3/M, z27.s, z14.s\n"
- ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
- "ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
- ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
- "ld1w { z12.s }, p1/Z, [x14]\n"
- "st1w { z27.s }, p0, [x26]\n"
- "ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z20.s\n"
+ "fmla z16.s, p3/M, z5.s, z20.s\n"
+ ".inst 0xa040c0c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "addvl x13, x13, #1\n"
+ "cmp x22, %x[n_channels]\n"
+ "ld1w { z11.s }, p1/Z, [x5, x17, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z13.s\n"
+ "fmla z18.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z12.s }, p1/Z, [x13]\n"
+ "fmla z19.s, p3/M, z6.s, z13.s\n"
+ ".inst 0xa040c0c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ ".inst 0xc1aec9fc // fclamp { z28.s-z31.s }, z15.s, z14.s\n"
+ "ld1w { z13.s }, p1/Z, [x16, x7, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "ld1w { z8.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ ".inst 0xc1aec9f0 // fclamp { z16.s-z19.s }, z15.s, z14.s\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z23.s }, p0, [x26]\n"
"st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
- "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x25, LSL #2]\n"
"addvl x26, x26, #1\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z30.s }, p0, [x24]\n"
"st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
- "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x24, x25, LSL #2]\n"
"addvl x24, x24, #1\n"
- "st1w { z21.s }, p0, [x23]\n"
- "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
- "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z17.s }, p0, [x23]\n"
+ "st1w { z18.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x25, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
- "movprfx z25, z24\n fmla z25.s, p3/M, z8.s, z9.s\n"
+ "movprfx z20, z25\n fmla z20.s, p3/M, z7.s, z9.s\n"
+ "movprfx z24, z25\n fmla z24.s, p3/M, z8.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x3, x3, #0x1\n"
- "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x2, #0x1\n"
- "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
- "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x3, x20\n"
- "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
- "fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z27.s }, p2/Z, [x7, x15, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z29.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x7, x4, LSL #2]\n"
- "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
- "csel x2, x2, x21, LT\n"
- "fmla z28.s, p3/M, z6.s, z17.s\n"
- "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
"mov p0.b, p2.b\n"
+ "movprfx z21, z25\n fmla z21.s, p3/M, z6.s, z9.s\n"
+ "movprfx z22, z25\n fmla z22.s, p3/M, z5.s, z9.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z23, z25\n fmla z23.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z25\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z29, z25\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z25\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z20.s, p3/M, z4.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x15, x8, LSL #2]\n"
+ "add x20, x2, #0x1\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, x4, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z13.s\n"
+ "cmp x3, x22\n"
+ "fmla z23.s, p3/M, z1.s, z13.s\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "csel x2, x2, x20, LT\n"
"csel x3, x3, XZR, LT\n"
- "fmla z25.s, p3/M, z5.s, z13.s\n"
- "fmla z29.s, p3/M, z3.s, z13.s\n"
- "cmp x2, x20\n"
- "fmla z30.s, p3/M, z2.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z13.s\n"
- "fmla z20.s, p3/M, z0.s, z13.s\n"
- "ld1w { z19.s }, p2/Z, [x5, x4, LSL #2]\n"
- "fmla z21.s, p3/M, z6.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
- "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z19.s\n"
- "fmla z23.s, p3/M, z8.s, z16.s\n"
- "fmla z25.s, p3/M, z7.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z0.s, z17.s\n"
- "fmla z30.s, p3/M, z4.s, z17.s\n"
- "fmla z31.s, p3/M, z3.s, z17.s\n"
- "fmla z21.s, p3/M, z1.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x6]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x16]\n"
- "fmla z20.s, p3/M, z4.s, z27.s\n"
- "fmla z25.s, p3/M, z1.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x6, x13, LSL #2]\n"
- "fmla z22.s, p3/M, z2.s, z27.s\n"
- "fmla z23.s, p3/M, z1.s, z27.s\n"
- "fmla z28.s, p3/M, z8.s, z27.s\n"
- "fmla z29.s, p3/M, z7.s, z27.s\n"
- "fmla z31.s, p3/M, z5.s, z27.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "movprfx z30, z25\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "cmp x2, x21\n"
+ "fmla z20.s, p3/M, z6.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z13.s\n"
+ "fmla z21.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z18.s\n"
"fmla z30.s, p3/M, z0.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z17.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x16]\n"
+ "fmla z21.s, p3/M, z1.s, z16.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
"fmla z20.s, p3/M, z2.s, z16.s\n"
- "fmla z21.s, p3/M, z3.s, z17.s\n"
- "fmla z22.s, p3/M, z4.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z19.s\n"
- "fmla z25.s, p3/M, z3.s, z18.s\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z17.s\n"
- "fmla z31.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z20.s, p3/M, z6.s, z19.s\n"
- "fmla z21.s, p3/M, z5.s, z19.s\n"
- "ld1w { z17.s }, p2/Z, [x6, x4, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x17, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x14]\n"
+ "fmla z21.s, p3/M, z7.s, z19.s\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "fmla z20.s, p3/M, z8.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, x17, LSL #2]\n"
"fmla z22.s, p3/M, z6.s, z16.s\n"
- "fmla z30.s, p3/M, z8.s, z19.s\n"
- "fmla z20.s, p3/M, z8.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x6, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmla z28.s, p3/M, z3.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z0.s, z17.s\n"
- "fmla z25.s, p3/M, z4.s, z17.s\n"
- "fmla z22.s, p3/M, z8.s, z16.s\n"
- "fmla z23.s, p3/M, z7.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z17.s\n"
- "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
"ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z18.s\n"
- "fmla z31.s, p3/M, z2.s, z18.s\n"
- "fmla z20.s, p3/M, z1.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
- "fmla z21.s, p3/M, z4.s, z17.s\n"
- "fmla z22.s, p3/M, z3.s, z17.s\n"
- "fmla z25.s, p3/M, z2.s, z16.s\n"
- "fmla z23.s, p3/M, z4.s, z19.s\n"
- "fmla z30.s, p3/M, z7.s, z17.s\n"
- "fmla z31.s, p3/M, z6.s, z17.s\n"
- "fmla z28.s, p3/M, z1.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x7]\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z20.s, p3/M, z7.s, z19.s\n"
- "fmla z22.s, p3/M, z5.s, z19.s\n"
- "fmla z25.s, p3/M, z6.s, z18.s\n"
- "fmla z21.s, p3/M, z0.s, z18.s\n"
- "fmla z23.s, p3/M, z2.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z18.s\n"
- "fmla z29.s, p3/M, z8.s, z17.s\n"
- "fmla z20.s, p3/M, z5.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "fmla z22.s, p3/M, z7.s, z16.s\n"
- "fmla z23.s, p3/M, z6.s, z16.s\n"
- "fmax z25.s, p3/M, z25.s, z26.s\n"
- "fmin z25.s, p3/M, z25.s, z14.s\n"
- ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
- "st1w { z25.s }, p0, [x26]\n"
- ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
- "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
- "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
- "st1w { z30.s }, p0, [x24]\n"
- "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
- "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
- "st1w { z21.s }, p0, [x23]\n"
- "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
- "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z19.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z18.s\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z18.s\n"
+ "fmla z23.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x8, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z5.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z31.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x15]\n"
+ "fmla z20.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x15, x17, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z19.s\n"
+ "fmla z30.s, p3/M, z5.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aec9f4 // fclamp { z20.s-z23.s }, z15.s, z14.s\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ ".inst 0xc1aec9fc // fclamp { z28.s-z31.s }, z15.s, z14.s\n"
+ "st1w { z22.s }, p0, [x24]\n"
+ "st1w { z24.s }, p0, [x26]\n"
+ "st1w { z20.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z21.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z23.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24, x25, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23]\n"
+ "st1w { z30.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x23, x25, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 2c868b6cf3..ee896b6ba1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,30 +87,30 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
activation_min, activation_max);
__asm__ __volatile__(
- "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
- "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z20.s }, p3/Z, [x8]\n"
- "addvl x8, x8, #1\n"
- "ldp x24, x23, [x17, #0x0]\n"
- "ldp x22, x21, [x17, #0x10]\n"
- "cntw x16\n"
- ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
- "addvl x8, x8, #4\n"
- "ldr x20, [x17, #0x20]\n"
- "mov x15, #0x0\n"
+ "ldp x24, x23, [x16, #0x0]\n"
+ "ldp x22, x21, [x16, #0x10]\n"
+ "cntw x14\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
- "addvl x8, x8, #4\n"
- "cmp x16, %x[n_channels]\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1w { z30.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ldr x20, [x16, #0x20]\n"
+ "cmp x14, %x[n_channels]\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "sub x12, XZR, x14\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
"ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x13, XZR, x16\n"
- "ld1w { z8.s }, p3/Z, [x8]\n"
- "addvl x8, x8, #1\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
"ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
"ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
"ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
@@ -118,323 +118,323 @@ void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x22, [x17, #0x30]\n"
- "incw x13\n"
- "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ldr x25, [x17, #0x38]\n"
+ "movprfx z31, z30\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z30\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x16, #0x30]\n"
+ "incw x12\n"
+ "movprfx z25, z30\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x27, [x16, #0x38]\n"
"mov p1.b, p2.b\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z20, z30\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x16, #0x28]\n"
+ "whilelt p0.s, x14, %x[n_channels]\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "movprfx z23, z30\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x16, #0x48]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x21, [x17, #0x28]\n"
- "whilelt p0.s, x16, %x[n_channels]\n"
- "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x20, [x17, #0x48]\n"
- "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z23.s }, p2/Z, [x22, x15, LSL #2]\n"
- "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x20, [x17, #0x40]\n"
- "fmla z21.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z23.s\n"
- "ldr x24, [x17, #0x50]\n"
- "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x23, [x17, #0x58]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ldr x26, [x16, #0x50]\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x22, [x17, #0x60]\n"
- "fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x17, #0x70]\n"
- "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z21.s, p3/M, z7.s, z23.s\n"
- "ldr x21, [x17, #0x68]\n"
- "fmla z24.s, p3/M, z0.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x25, [x16, #0x58]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "ldr x23, [x16, #0x68]\n"
+ "ld1w { z30.s }, p3/Z, [x17]\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ldr x22, [x16, #0x70]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
"ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldr x27, [x17, #0x78]\n"
- "fmla z26.s, p3/M, z4.s, z23.s\n"
- "fmla z27.s, p3/M, z3.s, z23.s\n"
- "ldr x20, [x17, #0x80]\n"
- "ld1w { z20.s }, p3/Z, [x8]\n"
- "fmla z30.s, p3/M, z0.s, z23.s\n"
- "fmla z28.s, p3/M, z4.s, z19.s\n"
- "ldr x11, [x17, #0x88]\n"
- "addvl x8, x8, #1\n"
- "fmla z29.s, p3/M, z1.s, z23.s\n"
+ "fmla z27.s, p3/M, z3.s, z17.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z20.s, p3/M, z4.s, z19.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z31.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z0.s, z18.s\n"
+ "ldr x11, [x16, #0x88]\n"
"fmla z21.s, p3/M, z1.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
- "ldr x26, [x17, #0x90]\n"
- "fmla z24.s, p3/M, z2.s, z16.s\n"
"fmla z25.s, p3/M, z1.s, z16.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
- "ldr x25, [x17, #0x98]\n"
- "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x10, [x16, #0x90]\n"
"fmla z27.s, p3/M, z5.s, z19.s\n"
- "fmla z30.s, p3/M, z2.s, z19.s\n"
- "ldr x24, [x17, #0xa0]\n"
- "fmla z26.s, p3/M, z0.s, z18.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ldr x10, [x14, #0x0]\n"
- "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z23.s, p3/M, z1.s, z19.s\n"
+ "ldr x9, [x13, #0x0]\n"
+ "fmla z22.s, p3/M, z2.s, z19.s\n"
+ "ldr x28, [x13, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z9.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z17.s\n"
"fmla z25.s, p3/M, z7.s, z19.s\n"
- "ldr x9, [x14, #0x8]\n"
- "fmla z31.s, p3/M, z1.s, z19.s\n"
- "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "ldr x26, [x13, #0x10]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "ldr x25, [x13, #0x18]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z25.s, p3/M, z5.s, z9.s\n"
"ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "ldr x23, [x17, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z17.s\n"
- "fmla z27.s, p3/M, z7.s, z16.s\n"
- "ld1w { z23.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldr x22, [x17, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z16.s\n"
- "fmla z30.s, p3/M, z4.s, z16.s\n"
- "ldr x28, [x14, #0x10]\n"
- "fmla z21.s, p3/M, z3.s, z18.s\n"
- "fmla z25.s, p3/M, z5.s, z11.s\n"
- "ld1w { z15.s }, p2/Z, [x12, x15, LSL #2]\n"
- "ldr x21, [x17, #0xb0]\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "fmla z31.s, p3/M, z3.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
- "ldr x20, [x17, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z16.s\n"
- "fmla z28.s, p3/M, z8.s, z15.s\n"
- "ldr x27, [x14, #0x18]\n"
- "fmla z30.s, p3/M, z6.s, z19.s\n"
- "fmla z24.s, p3/M, z3.s, z23.s\n"
- "fmla z27.s, p3/M, z0.s, z23.s\n"
- "fmla z31.s, p3/M, z5.s, z15.s\n"
- "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z7.s, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z4.s, z23.s\n"
- "fmla z26.s, p3/M, z1.s, z23.s\n"
- "fmla z24.s, p3/M, z5.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z17.s\n"
- "fmla z27.s, p3/M, z2.s, z17.s\n"
- "fmla z28.s, p3/M, z1.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z19.s\n"
- "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
- "ldr x26, [x17, #0x20]\n"
- "fmla z21.s, p3/M, z2.s, z17.s\n"
- "fmla z26.s, p3/M, z7.s, z16.s\n"
- "fmla z27.s, p3/M, z6.s, z16.s\n"
- "fmla z29.s, p3/M, z4.s, z16.s\n"
- "fmla z30.s, p3/M, z3.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z6.s, z18.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
- "fmla z24.s, p3/M, z1.s, z17.s\n"
- "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ldr x21, [x16, #0xb8]\n"
+ "fmla z27.s, p3/M, z7.s, z29.s\n"
+ "fmla z20.s, p3/M, z6.s, z29.s\n"
"ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
- "fmax z21.s, p3/M, z21.s, z22.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "fmla z30.s, p3/M, z5.s, z16.s\n"
- "fmla z29.s, p3/M, z0.s, z18.s\n"
- "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z22.s, p3/M, z4.s, z29.s\n"
+ "fmla z21.s, p3/M, z5.s, z29.s\n"
+ "fmla z23.s, p3/M, z3.s, z29.s\n"
+ "fmla z26.s, p3/M, z8.s, z29.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z17.s\n"
+ "fmla z20.s, p3/M, z8.s, z18.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z13.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z16.s\n"
+ "fmla z25.s, p3/M, z4.s, z16.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z28.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x16, #0x20]\n"
+ "fmla z22.s, p3/M, z8.s, z13.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z23.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z28.s\n"
+ "fmla z24.s, p3/M, z1.s, z28.s\n"
+ "fmla z27.s, p3/M, z6.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z28.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
"fmla z27.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
- "ldp x22, x21, [x17, #0x0]\n"
- "fmla z26.s, p3/M, z3.s, z18.s\n"
- "fmla z25.s, p3/M, z8.s, z17.s\n"
- "ldp x25, x24, [x17, #0x10]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldp x23, x22, [x16, #0x0]\n"
+ "fmla z23.s, p3/M, z2.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "ldp x21, x20, [x16, #0x10]\n"
"incw x15\n"
- "fmin z21.s, p3/M, z21.s, z14.s\n"
- "st1w { z21.s }, p1, [x10, x13, LSL #2]\n"
- "ldr x20, [x14, #0x20]\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z16.s\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "ld1w { z9.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z20.s, p3/M, z5.s, z18.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x23, x14, LSL #2]\n"
"whilelt p2.s, x15, %x[n_channels]\n"
- "fmla z31.s, p3/M, z6.s, z16.s\n"
- ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
- "st1w { z24.s }, p1, [x9, x13, LSL #2]\n"
- "ldr x23, [x14, #0x28]\n"
- "st1w { z25.s }, p1, [x28, x13, LSL #2]\n"
- "ldr x22, [x14, #0x30]\n"
- "ld1w { z10.s }, p0/Z, [x21, x16, LSL #2]\n"
- ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
- "st1w { z26.s }, p1, [x27, x13, LSL #2]\n"
- "ldr x21, [x14, #0x38]\n"
- "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
- "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
- "ldr x20, [x14, #0x40]\n"
- "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
- "ld1w { z13.s }, p0/Z, [x26, x16, LSL #2]\n"
- "incw x16\n"
- "cmp x16, %x[n_channels]\n"
- "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
- ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
- "addvl x8, x8, #4\n"
- "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
- ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
- "addvl x8, x8, #4\n"
- "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
- "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x8]\n"
- "addvl x8, x8, #1\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z10.s }, p0/Z, [x22, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z11.s }, p0/Z, [x21, x14, LSL #2]\n"
+ ".inst 0xc1aec9f8 // fclamp { z24.s-z27.s }, z15.s, z14.s\n"
+ "ld1w { z12.s }, p0/Z, [x20, x14, LSL #2]\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "ld1w { z13.s }, p0/Z, [x24, x14, LSL #2]\n"
+ "incw x14\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x14, %x[n_channels]\n"
+ ".inst 0xc1aec9f4 // fclamp { z20.s-z23.s }, z15.s, z14.s\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z24.s }, p1, [x28, x12, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "st1w { z31.s }, p1, [x9, x12, LSL #2]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z25.s }, p1, [x26, x12, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "st1w { z26.s }, p1, [x25, x12, LSL #2]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "st1w { z27.s }, p1, [x20, x12, LSL #2]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "st1w { z20.s }, p1, [x23, x12, LSL #2]\n"
+ "st1w { z21.s }, p1, [x22, x12, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x12, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x12, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
- "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
- "ldr x23, [x17, #0x30]\n"
- "incw x13\n"
- "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ldr x22, [x17, #0x38]\n"
+ "movprfx z20, z30\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z30\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x16, #0x30]\n"
+ "incw x12\n"
+ "movprfx z25, z30\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x27, [x16, #0x38]\n"
"mov p0.b, p2.b\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z30\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x16, #0x28]\n"
+ "movprfx z29, z30\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z30\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x16, #0x48]\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
"fmla z24.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "ldr x21, [x17, #0x28]\n"
- "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
- "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
- "ldr x20, [x17, #0x48]\n"
- "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x40]\n"
"fmla z25.s, p3/M, z2.s, z11.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
- "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
- "ldr x20, [x17, #0x40]\n"
- "fmla z21.s, p3/M, z5.s, z13.s\n"
- "fmla z24.s, p3/M, z6.s, z18.s\n"
- "ldr x25, [x17, #0x50]\n"
- "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z3.s, z13.s\n"
- "ldr x24, [x17, #0x58]\n"
+ "ld1w { z19.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ldr x26, [x16, #0x50]\n"
"fmla z27.s, p3/M, z1.s, z13.s\n"
- "ldr x23, [x17, #0x60]\n"
"fmla z28.s, p3/M, z0.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x25, [x16, #0x58]\n"
"fmla z29.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x17, #0x70]\n"
- "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
- "fmla z21.s, p3/M, z7.s, z18.s\n"
- "ldr x22, [x17, #0x68]\n"
- "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "fmla z20.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "ldr x22, [x16, #0x70]\n"
"fmla z31.s, p3/M, z8.s, z16.s\n"
"ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldr x21, [x17, #0x78]\n"
- "fmla z26.s, p3/M, z4.s, z18.s\n"
- "fmla z27.s, p3/M, z3.s, z18.s\n"
- "ldr x20, [x17, #0x80]\n"
- "fmla z30.s, p3/M, z0.s, z18.s\n"
- "fmla z28.s, p3/M, z4.s, z19.s\n"
- "ldr x11, [x17, #0x88]\n"
- "fmla z29.s, p3/M, z1.s, z18.s\n"
- "fmla z21.s, p3/M, z1.s, z17.s\n"
- "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
- "ldr x10, [x17, #0x90]\n"
- "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z3.s, z19.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z19.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "ldr x11, [x16, #0x88]\n"
+ "fmla z29.s, p3/M, z1.s, z19.s\n"
"fmla z25.s, p3/M, z1.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
- "ldr x9, [x17, #0x98]\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z5.s, z19.s\n"
- "fmla z30.s, p3/M, z2.s, z19.s\n"
- "ldr x28, [x17, #0xa0]\n"
- "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x10, [x16, #0x90]\n"
+ "fmla z27.s, p3/M, z5.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ldr x9, [x13, #0x0]\n"
+ "fmla z30.s, p3/M, z2.s, z18.s\n"
+ "ldr x28, [x13, #0x8]\n"
+ "fmla z20.s, p3/M, z1.s, z17.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "ldr x25, [x13, #0x10]\n"
"fmla z28.s, p3/M, z2.s, z17.s\n"
- "ldr x27, [x14, #0x0]\n"
- "fmla z24.s, p3/M, z8.s, z19.s\n"
- "fmla z25.s, p3/M, z7.s, z19.s\n"
- "ldr x26, [x14, #0x8]\n"
- "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ldr x24, [x13, #0x18]\n"
+ "fmla z24.s, p3/M, z8.s, z18.s\n"
"fmla z29.s, p3/M, z3.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
- "ldr x25, [x17, #0xa8]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x16, #0xa8]\n"
"fmla z26.s, p3/M, z6.s, z16.s\n"
- "fmla z27.s, p3/M, z7.s, z19.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldr x23, [x17, #0xc0]\n"
- "fmla z28.s, p3/M, z6.s, z19.s\n"
- "fmla z30.s, p3/M, z4.s, z19.s\n"
- "ldr x24, [x14, #0x10]\n"
- "fmla z21.s, p3/M, z3.s, z20.s\n"
+ "fmla z20.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x16, #0xb0]\n"
"fmla z25.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
- "ldr x22, [x17, #0xb0]\n"
- "fmla z29.s, p3/M, z5.s, z19.s\n"
- "fmla z31.s, p3/M, z3.s, z19.s\n"
"ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "ldr x20, [x17, #0xb8]\n"
- "fmla z26.s, p3/M, z8.s, z19.s\n"
- "fmla z28.s, p3/M, z8.s, z17.s\n"
- "ldr x21, [x14, #0x18]\n"
+ "ldr x21, [x16, #0xb8]\n"
+ "fmla z27.s, p3/M, z7.s, z18.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z30.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "fmla z31.s, p3/M, z3.s, z18.s\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
"fmla z30.s, p3/M, z6.s, z16.s\n"
- "fmla z24.s, p3/M, z3.s, z18.s\n"
- "fmla z27.s, p3/M, z0.s, z18.s\n"
- "fmla z31.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
"fmla z29.s, p3/M, z7.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x10, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z4.s, z18.s\n"
- "fmla z26.s, p3/M, z1.s, z18.s\n"
- "fmla z24.s, p3/M, z5.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x9, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z4.s, z17.s\n"
- "fmla z27.s, p3/M, z2.s, z17.s\n"
- "fmla z28.s, p3/M, z1.s, z17.s\n"
- "fmla z30.s, p3/M, z8.s, z19.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z2.s, z17.s\n"
- "fmla z26.s, p3/M, z7.s, z16.s\n"
- "fmla z27.s, p3/M, z6.s, z16.s\n"
- "fmla z29.s, p3/M, z4.s, z16.s\n"
- "fmla z30.s, p3/M, z3.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z21.s, p3/M, z6.s, z18.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z16.s\n"
+ "fmla z25.s, p3/M, z4.s, z16.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z7.s, z19.s\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z31.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z17.s\n"
"fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z27.s, p3/M, z6.s, z19.s\n"
"fmla z25.s, p3/M, z0.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
- "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
"fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z20.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
"fmla z30.s, p3/M, z5.s, z16.s\n"
- "fmla z29.s, p3/M, z0.s, z18.s\n"
- "fmla z31.s, p3/M, z2.s, z17.s\n"
"fmla z27.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z3.s, z18.s\n"
- "fmla z25.s, p3/M, z8.s, z17.s\n"
- "fmin z21.s, p3/M, z21.s, z14.s\n"
- "st1w { z21.s }, p0, [x27, x13, LSL #2]\n"
- "ldr x20, [x14, #0x20]\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
"fmla z29.s, p3/M, z8.s, z16.s\n"
"fmla z30.s, p3/M, z7.s, z16.s\n"
"fmla z31.s, p3/M, z6.s, z16.s\n"
- ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
- "st1w { z24.s }, p0, [x26, x13, LSL #2]\n"
- "ldr x23, [x14, #0x28]\n"
- "st1w { z25.s }, p0, [x24, x13, LSL #2]\n"
- "ldr x22, [x14, #0x30]\n"
- ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
- "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
- "ldr x21, [x14, #0x38]\n"
- "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
- "ldr x20, [x14, #0x40]\n"
- "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
- "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
- "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
- "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xc1aec9f8 // fclamp { z24.s-z27.s }, z15.s, z14.s\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ ".inst 0xc1aec9fc // fclamp { z28.s-z31.s }, z15.s, z14.s\n"
+ "st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "st1w { z20.s }, p0, [x9, x12, LSL #2]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z25.s }, p0, [x25, x12, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "st1w { z27.s }, p0, [x20, x12, LSL #2]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x12, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x12, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x12, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
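A minimal C++ sketch of the tile-base arithmetic spelled out in the assembly comments of the direct kernels below (`offset = tile_i * ld_input_row`, `offset += tile_j * ld_input_col`, `offset *= kernel_stride * output_size`, then the base pointer is advanced by `offset * sizeof(float)`). The helper name and standalone form are illustrative assumptions, not code from these files; the factor of 4 corresponds to the 4x4 output tile handled by this kernel.

    // Sketch only: mirrors the offset comments in the generated assembly below.
    // tile_i/tile_j and ld_input_row/ld_input_col follow the Args fields the asm reads.
    #include <cstddef>

    static const float *tile_input_base(const float *inptr, std::size_t tile_i,
                                        std::size_t tile_j, std::size_t ld_input_row,
                                        std::size_t ld_input_col)
    {
      std::size_t offset = tile_i * ld_input_row;  // offset  = tile_i * ld_input_row
      offset += tile_j * ld_input_col;             // offset += tile_j * ld_input_col
      offset *= 4;                                 // offset *= kernel_stride * output_size (4x4 tile)
      return inptr + offset;                       // asm: add x5, x5, x20, LSL #2 (scales by sizeof(float))
    }

The same pattern, with `ld_output_row`/`ld_output_col` and `output_tile_size`, is used for the output pointers further down in each tile loop.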
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index efd37c38ec..cf4a0d5b9b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,98 +88,98 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ptrue p3.b\n"
- ".inst 0x25207810 // ptrue pn8.b\n"
"mov x2, #0x0\n"
"mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
"1:" // Tile loop
"str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x22, #0x4\n"
"str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "add x7, x4, x4\n"
"mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x8, x7, x4\n"
"add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x6, x5, x21, LSL #2\n"
- "add x7, x6, x21, LSL #2\n"
- "add x8, x4, x4\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, x7, x21, LSL #2\n"
- "add x15, x8, x4\n"
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #2\n"
+ "add x15, x17, x4\n"
"add x14, x16, x21, LSL #2\n"
- "add x13, x15, x4\n"
- "add x12, x14, x21, LSL #2\n"
- "add x11, x13, x4\n"
+ "add x13, x14, x21, LSL #2\n"
+ "add x12, x13, x21, LSL #2\n"
+ "add x11, x12, x21, LSL #2\n"
"cbnz x3, 2f\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x21, x20, x3\n"
- "sub x21, x21, #0x1\n"
"lsl x10, %x[n_channels], #0x2\n"
- "mov x20, #0x10\n"
- "and x21, x21, #0x3fffff\n"
- "mul x20, x20, x4\n"
- "orr x10, x10, x21, LSL #22\n"
- "orr x10, x10, x20, LSL #38\n"
- "add x9, x7, x8, LSL #2\n"
- "add x28, x5, x11, LSL #2\n"
- "add x27, x7, x15, LSL #2\n"
- "add x26, x12, x11, LSL #2\n"
- "add x25, x16, x8, LSL #2\n"
+ "mov x21, #0x10\n"
+ "mul x21, x21, x4\n"
+ "add x9, x14, x7, LSL #2\n"
+ "add x28, x5, x15, LSL #2\n"
+ "add x27, x14, x8, LSL #2\n"
+ "sub x20, x20, x3\n"
+ "add x26, x11, x15, LSL #2\n"
+ "sub x20, x20, #0x1\n"
+ "add x25, x13, x7, LSL #2\n"
+ "and x20, x20, #0x3fffff\n"
"add x24, x5, x4, LSL #2\n"
- "add x23, x5, x13, LSL #2\n"
- "add x22, x16, x15, LSL #2\n"
- "add x21, x6, x11, LSL #2\n"
- "add x20, x6, x8, LSL #2\n"
+ "orr x10, x10, x20, LSL #22\n"
+ "add x23, x5, x17, LSL #2\n"
+ "orr x10, x10, x21, LSL #38\n"
+ "add x22, x13, x8, LSL #2\n"
+ "add x21, x16, x15, LSL #2\n"
+ "add x20, x16, x7, LSL #2\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- "add x9, x14, x11, LSL #2\n"
+ "add x9, x12, x15, LSL #2\n"
".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
- "add x28, x6, x15, LSL #2\n"
+ "add x28, x16, x8, LSL #2\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
- "add x27, x12, x4, LSL #2\n"
- ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ "add x27, x11, x4, LSL #2\n"
+ ".inst 0xf8aa497a // rprfm pldonce, x10, [x11]\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
- "add x26, x7, x4, LSL #2\n"
+ "add x26, x14, x4, LSL #2\n"
".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
- "add x25, x12, x13, LSL #2\n"
+ "add x25, x11, x17, LSL #2\n"
".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
- "add x24, x7, x13, LSL #2\n"
+ "add x24, x14, x17, LSL #2\n"
".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
- "add x23, x5, x8, LSL #2\n"
+ "add x23, x5, x7, LSL #2\n"
".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
- "add x22, x16, x4, LSL #2\n"
- ".inst 0xf8aa48da // rprfm pldonce, x10, [x6]\n"
+ "add x22, x13, x4, LSL #2\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
- "add x21, x5, x15, LSL #2\n"
- ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ "add x21, x5, x8, LSL #2\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
- "add x20, x16, x13, LSL #2\n"
+ "add x20, x13, x17, LSL #2\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- "add x9, x7, x11, LSL #2\n"
+ "add x9, x14, x15, LSL #2\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
- "add x28, x14, x8, LSL #2\n"
+ "add x28, x12, x7, LSL #2\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
- "add x27, x16, x11, LSL #2\n"
+ "add x27, x13, x15, LSL #2\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
- "add x26, x12, x8, LSL #2\n"
+ "add x26, x11, x7, LSL #2\n"
".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
- "add x25, x14, x15, LSL #2\n"
+ "add x25, x12, x8, LSL #2\n"
".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
- "add x24, x12, x15, LSL #2\n"
+ "add x24, x11, x8, LSL #2\n"
".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
- "add x23, x6, x4, LSL #2\n"
+ "add x23, x16, x4, LSL #2\n"
".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
- "add x22, x6, x13, LSL #2\n"
+ "add x22, x16, x17, LSL #2\n"
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
- "add x21, x14, x4, LSL #2\n"
- ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ "add x21, x12, x4, LSL #2\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
- "add x20, x14, x13, LSL #2\n"
+ "add x20, x12, x17, LSL #2\n"
".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
- ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ ".inst 0xf8aa49ba // rprfm pldonce, x10, [x13]\n"
".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
@@ -190,67 +190,67 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
- "mov x20, #0x4\n"
- "ld1w { z14.s }, p3/Z, [x17]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x21, #0x4\n"
+ "ld1w { z14.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
"ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
- "mul x21, x21, x20\n" // offset *= output_tile_size
- "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cntw x22\n"
+ ".inst 0xa040c0c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
"ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "addvl x17, x17, #1\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "add x27, x28, x22, LSL #2\n"
- "cntw x26\n"
- "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "addvl x17, x17, #4\n"
- "add x25, x27, x22, LSL #2\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "add x24, x9, x9\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
- "addvl x17, x17, #4\n"
- "cmp x26, %x[n_channels]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "add x23, x25, x22, LSL #2\n"
- "add x22, x24, x9\n"
- "ld1w { z10.s }, p2/Z, [x5]\n"
+ ".inst 0xa040c0c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x20, x2, x23\n" // offset = tile_i * ld_output_row
+ "cmp x22, %x[n_channels]\n"
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x3, x9, x20\n" // offset += tile_j * ld_output_col
+ "add x27, x9, x9\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x20, x20, x21\n" // offset *= output_tile_size
+ "add x26, x27, x9\n"
+ "ld1w { z8.s }, p3/Z, [x6]\n"
+ "add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
"mov x21, #0x0\n"
- "sub x20, XZR, x26\n"
- "ld1w { z11.s }, p2/Z, [x5, x11, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
- "addvl x17, x17, #1\n"
+ "ld1w { z9.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "add x25, x28, x23, LSL #2\n"
+ "sub x20, XZR, x22\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "add x24, x25, x23, LSL #2\n"
+ "ld1w { z11.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "add x23, x24, x23, LSL #2\n"
+ "ld1w { z12.s }, p2/Z, [x14, x8, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
"movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
"movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x26, %x[n_channels]\n"
+ "whilelt p1.s, x22, %x[n_channels]\n"
"incw x21\n"
"movprfx z26, z14\n fmla z26.s, p3/M, z3.s, z9.s\n"
"movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
- "incw x26\n"
+ "incw x22\n"
"mov p0.b, p2.b\n"
"movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
- "fmla z25.s, p3/M, z5.s, z12.s\n"
- "incw x20\n"
"movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "incw x20\n"
"movprfx z30, z14\n fmla z30.s, p3/M, z6.s, z9.s\n"
"movprfx z24, z14\n fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
"movprfx z16, z14\n fmla z16.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x13, x7, LSL #2]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"movprfx z31, z14\n fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z19.s }, p2/Z, [x12]\n"
+ "ld1w { z19.s }, p2/Z, [x11]\n"
"fmla z26.s, p3/M, z4.s, z12.s\n"
"fmla z17.s, p3/M, z2.s, z12.s\n"
- "ld1w { z22.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x11, x15, LSL #2]\n"
"fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
"movprfx z20, z14\n fmla z20.s, p3/M, z6.s, z19.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
"fmla z25.s, p3/M, z7.s, z9.s\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x8, LSL #2]\n"
"fmla z30.s, p3/M, z7.s, z12.s\n"
"fmla z31.s, p3/M, z6.s, z12.s\n"
"movprfx z27, z14\n fmla z27.s, p3/M, z3.s, z12.s\n"
@@ -258,24 +258,24 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
"movprfx z23, z14\n fmla z23.s, p3/M, z8.s, z22.s\n"
"fmla z26.s, p3/M, z6.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x5, x17, LSL #2]\n"
"fmla z17.s, p3/M, z4.s, z9.s\n"
"fmla z18.s, p3/M, z3.s, z9.s\n"
"movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
"movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
- "ld1w { z14.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
+ "ld1w { z14.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
"fmla z24.s, p3/M, z8.s, z9.s\n"
"fmla z16.s, p3/M, z5.s, z9.s\n"
"fmla z20.s, p3/M, z2.s, z9.s\n"
"fmla z25.s, p3/M, z8.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x6]\n"
+ "ld1w { z9.s }, p2/Z, [x16]\n"
"fmla z28.s, p3/M, z1.s, z10.s\n"
"fmla z29.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14]\n"
+ "ld1w { z12.s }, p2/Z, [x12]\n"
"fmla z26.s, p3/M, z7.s, z11.s\n"
"fmla z27.s, p3/M, z6.s, z11.s\n"
"fmla z17.s, p3/M, z5.s, z11.s\n"
@@ -284,52 +284,52 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z21.s, p3/M, z2.s, z11.s\n"
"fmla z22.s, p3/M, z1.s, z11.s\n"
"fmla z23.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x16, x7, LSL #2]\n"
"fmla z24.s, p3/M, z0.s, z9.s\n"
"fmla z16.s, p3/M, z6.s, z12.s\n"
"fmla z20.s, p3/M, z3.s, z12.s\n"
- "fmla z25.s, p3/M, z1.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
"fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
"fmla z31.s, p3/M, z5.s, z10.s\n"
"fmla z27.s, p3/M, z2.s, z10.s\n"
"fmla z29.s, p3/M, z4.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x16, x8, LSL #2]\n"
"fmla z30.s, p3/M, z3.s, z11.s\n"
"fmla z26.s, p3/M, z0.s, z11.s\n"
"fmla z19.s, p3/M, z8.s, z12.s\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x11, x4, LSL #2]\n"
"fmla z24.s, p3/M, z2.s, z11.s\n"
"fmla z25.s, p3/M, z2.s, z10.s\n"
"fmla z28.s, p3/M, z5.s, z11.s\n"
"fmla z29.s, p3/M, z5.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x14, x4, LSL #2]\n"
"fmla z30.s, p3/M, z4.s, z10.s\n"
"fmla z31.s, p3/M, z3.s, z10.s\n"
"fmla z26.s, p3/M, z1.s, z10.s\n"
"fmla z27.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n"
"fmla z20.s, p3/M, z7.s, z12.s\n"
"fmla z21.s, p3/M, z6.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x11, x17, LSL #2]\n"
"fmla z24.s, p3/M, z4.s, z9.s\n"
"fmla z25.s, p3/M, z3.s, z9.s\n"
"fmla z16.s, p3/M, z1.s, z9.s\n"
"fmla z17.s, p3/M, z0.s, z9.s\n"
"fmla z28.s, p3/M, z7.s, z9.s\n"
"fmla z29.s, p3/M, z6.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x5, x7, LSL #2]\n"
"fmla z22.s, p3/M, z8.s, z11.s\n"
"fmla z23.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x4, LSL #2]\n"
"fmla z30.s, p3/M, z8.s, z10.s\n"
"fmla z31.s, p3/M, z7.s, z10.s\n"
"fmla z26.s, p3/M, z5.s, z10.s\n"
"fmla z27.s, p3/M, z4.s, z10.s\n"
"fmla z18.s, p3/M, z2.s, z10.s\n"
"fmla z19.s, p3/M, z1.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x5, x8, LSL #2]\n"
"addvl x5, x5, #1\n"
"fmla z24.s, p3/M, z7.s, z11.s\n"
"fmla z25.s, p3/M, z6.s, z11.s\n"
@@ -337,153 +337,153 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z17.s, p3/M, z3.s, z11.s\n"
"fmla z20.s, p3/M, z1.s, z11.s\n"
"fmla z21.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n"
"fmla z28.s, p3/M, z2.s, z12.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
"fmla z30.s, p3/M, z0.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x7]\n"
- "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
"fmla z31.s, p3/M, z0.s, z9.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
"fmla z26.s, p3/M, z8.s, z11.s\n"
"fmla z27.s, p3/M, z7.s, z11.s\n"
"fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
"fmla z19.s, p3/M, z4.s, z11.s\n"
"fmla z23.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x7, LSL #2]\n"
"fmla z29.s, p3/M, z2.s, z9.s\n"
"fmla z30.s, p3/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
- "addvl x7, x7, #1\n"
+ "ld1w { z12.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "addvl x14, x14, #1\n"
"fmla z28.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x16]\n"
+ "ld1w { z10.s }, p2/Z, [x13]\n"
"fmla z21.s, p3/M, z4.s, z11.s\n"
"fmla z22.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x7, LSL #2]\n"
"fmla z31.s, p3/M, z8.s, z12.s\n"
- "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
"fmla z27.s, p3/M, z5.s, z12.s\n"
"fmla z19.s, p3/M, z2.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
- "addvl x16, x16, #1\n"
"fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x15, LSL #2]\n"
+ "addvl x13, x13, #1\n"
"fmla z16.s, p3/M, z3.s, z10.s\n"
"fmla z20.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
"fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"fmla z21.s, p3/M, z7.s, z10.s\n"
"fmla z22.s, p3/M, z6.s, z10.s\n"
"fmla z16.s, p3/M, z8.s, z11.s\n"
- "fmla z17.s, p3/M, z7.s, z11.s\n"
- "fmla z18.s, p3/M, z6.s, z11.s\n"
"fmla z20.s, p3/M, z5.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n"
"fmla z19.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n"
+ "addvl x11, x11, #1\n"
"fmla z21.s, p3/M, z5.s, z11.s\n"
"fmla z22.s, p3/M, z4.s, z11.s\n"
"fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z27.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
"fmla z20.s, p3/M, z8.s, z10.s\n"
- "addvl x12, x12, #1\n"
- "ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
"fmla z17.s, p3/M, z8.s, z11.s\n"
"fmla z18.s, p3/M, z7.s, z11.s\n"
"fmla z19.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x17, LSL #2]\n"
+ "addvl x16, x16, #1\n"
"fmla z21.s, p3/M, z8.s, z12.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
- "addvl x6, x6, #1\n"
"fmla z22.s, p3/M, z7.s, z12.s\n"
"fmla z23.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
"fmla z28.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
"fmla z29.s, p3/M, z3.s, z10.s\n"
"fmla z24.s, p3/M, z1.s, z10.s\n"
"fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z10.s }, p2/Z, [x12, x17, LSL #2]\n"
"fmla z30.s, p3/M, z5.s, z11.s\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
"fmla z31.s, p3/M, z4.s, z11.s\n"
- "cmp x26, %x[n_channels]\n"
- "addvl x14, x14, #1\n"
"fmla z26.s, p3/M, z2.s, z11.s\n"
+ "cmp x22, %x[n_channels]\n"
+ "addvl x12, x12, #1\n"
"fmla z27.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
"fmla z16.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x15, LSL #2]\n"
"fmla z17.s, p3/M, z6.s, z12.s\n"
"fmla z20.s, p3/M, z4.s, z12.s\n"
"fmla z21.s, p3/M, z3.s, z12.s\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
"fmla z18.s, p3/M, z8.s, z10.s\n"
+ ".inst 0xa040c0c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
"fmla z19.s, p3/M, z7.s, z10.s\n"
- "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
"fmla z22.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z12.s }, p1/Z, [x14, x8, LSL #2]\n"
"fmla z23.s, p3/M, z4.s, z10.s\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
+ ".inst 0xa040c0c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
"ld1w { z10.s }, p1/Z, [x5]\n"
+ "ld1w { z8.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
"st1w { z28.s }, p0, [x28]\n"
"st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z30.s }, p0, [x28, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x28, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x26, LSL #2]\n"
"addvl x28, x28, #1\n"
- "st1w { z24.s }, p0, [x27]\n"
- "st1w { z25.s }, p0, [x27, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x27, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x27, x22, LSL #2]\n"
- "addvl x27, x27, #1\n"
- "st1w { z16.s }, p0, [x25]\n"
- "st1w { z17.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
- "st1w { z19.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z24.s }, p0, [x25]\n"
+ "st1w { z25.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x27, LSL #2]\n"
+ "st1w { z27.s }, p0, [x25, x26, LSL #2]\n"
"addvl x25, x25, #1\n"
+ "st1w { z16.s }, p0, [x24]\n"
+ "st1w { z17.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z19.s }, p0, [x24, x26, LSL #2]\n"
+ "addvl x24, x24, #1\n"
"st1w { z20.s }, p0, [x23]\n"
"st1w { z21.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z22.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x26, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
"movprfx z21, z14\n fmla z21.s, p3/M, z4.s, z9.s\n"
"movprfx z24, z14\n fmla z24.s, p3/M, z8.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x3, x3, #0x1\n"
+ "mov p0.b, p2.b\n"
"movprfx z22, z14\n fmla z22.s, p3/M, z3.s, z9.s\n"
"movprfx z29, z14\n fmla z29.s, p3/M, z1.s, z9.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "add x21, x2, #0x1\n"
"movprfx z30, z14\n fmla z30.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "cmp x3, x20\n"
"movprfx z25, z14\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"movprfx z26, z14\n fmla z26.s, p3/M, z6.s, z9.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x2, x2, x21, LT\n"
"movprfx z20, z14\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
"movprfx z28, z14\n fmla z28.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
- "mov p0.b, p2.b\n"
+ "ld1w { z9.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "add x20, x2, #0x1\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
"movprfx z27, z14\n fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x12]\n"
- "csel x3, x3, XZR, LT\n"
+ "ld1w { z17.s }, p2/Z, [x11]\n"
+ "cmp x3, x22\n"
"fmla z22.s, p3/M, z4.s, z12.s\n"
"fmla z29.s, p3/M, z2.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x12, x11, LSL #2]\n"
- "cmp x2, x20\n"
+ "ld1w { z18.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "csel x2, x2, x20, LT\n"
"fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "csel x3, x3, XZR, LT\n"
+ "cmp x2, x21\n"
"movprfx z16, z14\n fmla z16.s, p3/M, z6.s, z17.s\n"
- "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
"fmla z21.s, p3/M, z7.s, z9.s\n"
- "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x13, x8, LSL #2]\n"
"fmla z26.s, p3/M, z7.s, z12.s\n"
"fmla z27.s, p3/M, z6.s, z12.s\n"
"movprfx z23, z14\n fmla z23.s, p3/M, z3.s, z12.s\n"
@@ -491,7 +491,7 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
"movprfx z19, z14\n fmla z19.s, p3/M, z8.s, z18.s\n"
"fmla z22.s, p3/M, z6.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x5, x17, LSL #2]\n"
"fmla z29.s, p3/M, z4.s, z9.s\n"
"fmla z30.s, p3/M, z3.s, z9.s\n"
"movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
@@ -500,13 +500,13 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z28.s, p3/M, z5.s, z9.s\n"
"fmla z16.s, p3/M, z2.s, z9.s\n"
"fmla z21.s, p3/M, z8.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x6]\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
"fmla z24.s, p3/M, z1.s, z10.s\n"
"fmla z25.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x16, x15, LSL #2]\n"
"fmla z26.s, p3/M, z2.s, z12.s\n"
"fmla z27.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14]\n"
+ "ld1w { z12.s }, p2/Z, [x12]\n"
"fmla z22.s, p3/M, z7.s, z11.s\n"
"fmla z23.s, p3/M, z6.s, z11.s\n"
"fmla z29.s, p3/M, z5.s, z11.s\n"
@@ -515,117 +515,117 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z17.s, p3/M, z2.s, z11.s\n"
"fmla z18.s, p3/M, z1.s, z11.s\n"
"fmla z19.s, p3/M, z0.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x16, x7, LSL #2]\n"
"fmla z20.s, p3/M, z0.s, z14.s\n"
"fmla z28.s, p3/M, z6.s, z12.s\n"
"fmla z16.s, p3/M, z3.s, z12.s\n"
- "fmla z21.s, p3/M, z1.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x15, LSL #2]\n"
"fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
"fmla z27.s, p3/M, z5.s, z10.s\n"
"fmla z23.s, p3/M, z2.s, z10.s\n"
"fmla z25.s, p3/M, z4.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x16, x8, LSL #2]\n"
"fmla z26.s, p3/M, z3.s, z9.s\n"
"fmla z22.s, p3/M, z0.s, z9.s\n"
"fmla z31.s, p3/M, z8.s, z11.s\n"
"fmla z19.s, p3/M, z5.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x4, LSL #2]\n"
"fmla z20.s, p3/M, z2.s, z9.s\n"
"fmla z21.s, p3/M, z2.s, z12.s\n"
"fmla z24.s, p3/M, z5.s, z9.s\n"
"fmla z25.s, p3/M, z5.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x14, x4, LSL #2]\n"
"fmla z26.s, p3/M, z4.s, z12.s\n"
"fmla z27.s, p3/M, z3.s, z12.s\n"
"fmla z22.s, p3/M, z1.s, z12.s\n"
"fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x17, LSL #2]\n"
"fmla z16.s, p3/M, z7.s, z10.s\n"
"fmla z17.s, p3/M, z6.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x11, x17, LSL #2]\n"
"fmla z20.s, p3/M, z4.s, z9.s\n"
"fmla z21.s, p3/M, z3.s, z9.s\n"
"fmla z28.s, p3/M, z1.s, z9.s\n"
"fmla z29.s, p3/M, z0.s, z9.s\n"
"fmla z24.s, p3/M, z7.s, z9.s\n"
"fmla z25.s, p3/M, z6.s, z9.s\n"
- "ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x5, x7, LSL #2]\n"
"fmla z18.s, p3/M, z8.s, z11.s\n"
"fmla z19.s, p3/M, z7.s, z11.s\n"
- "ld1w { z14.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x13, x4, LSL #2]\n"
"fmla z26.s, p3/M, z8.s, z12.s\n"
"fmla z27.s, p3/M, z7.s, z12.s\n"
"fmla z22.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
"fmla z30.s, p3/M, z2.s, z12.s\n"
"fmla z31.s, p3/M, z1.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x5, x8, LSL #2]\n"
"fmla z20.s, p3/M, z7.s, z14.s\n"
"fmla z21.s, p3/M, z6.s, z14.s\n"
"fmla z28.s, p3/M, z4.s, z14.s\n"
"fmla z29.s, p3/M, z3.s, z14.s\n"
"fmla z16.s, p3/M, z1.s, z14.s\n"
"fmla z17.s, p3/M, z0.s, z14.s\n"
- "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x13, x17, LSL #2]\n"
"fmla z24.s, p3/M, z2.s, z10.s\n"
"fmla z25.s, p3/M, z1.s, z10.s\n"
"fmla z26.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x7]\n"
- "fmla z18.s, p3/M, z2.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
"fmla z27.s, p3/M, z0.s, z9.s\n"
- "fmla z20.s, p3/M, z3.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z14.s\n"
"fmla z22.s, p3/M, z8.s, z14.s\n"
"fmla z23.s, p3/M, z7.s, z14.s\n"
"fmla z30.s, p3/M, z5.s, z14.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z31.s, p3/M, z4.s, z14.s\n"
"fmla z19.s, p3/M, z1.s, z14.s\n"
- "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x7, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z9.s\n"
"fmla z26.s, p3/M, z1.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x15, LSL #2]\n"
"fmla z24.s, p3/M, z6.s, z10.s\n"
- "ld1w { z14.s }, p2/Z, [x16]\n"
+ "ld1w { z14.s }, p2/Z, [x13]\n"
"fmla z17.s, p3/M, z4.s, z11.s\n"
"fmla z18.s, p3/M, z3.s, z11.s\n"
"fmla z27.s, p3/M, z8.s, z12.s\n"
"fmla z23.s, p3/M, z5.s, z12.s\n"
"fmla z31.s, p3/M, z2.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x16, x11, LSL #2]\n"
"fmla z20.s, p3/M, z6.s, z14.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x15, LSL #2]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z16.s, p3/M, z0.s, z14.s\n"
- "ld1w { z12.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
"fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z23.s, p3/M, z8.s, z9.s\n"
"fmla z17.s, p3/M, z7.s, z12.s\n"
"fmla z18.s, p3/M, z6.s, z12.s\n"
"fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmla z29.s, p3/M, z7.s, z11.s\n"
- "fmla z30.s, p3/M, z6.s, z11.s\n"
"fmla z16.s, p3/M, z5.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
"fmla z31.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z14.s }, p2/Z, [x11, x8, LSL #2]\n"
"fmla z17.s, p3/M, z5.s, z10.s\n"
"fmla z18.s, p3/M, z4.s, z10.s\n"
"fmla z19.s, p3/M, z3.s, z10.s\n"
- "fmla z23.s, p3/M, z8.s, z9.s\n"
- "ld1w { z14.s }, p2/Z, [x12, x15, LSL #2]\n"
"fmla z16.s, p3/M, z8.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x16, x4, LSL #2]\n"
"fmla z29.s, p3/M, z8.s, z10.s\n"
"fmla z30.s, p3/M, z7.s, z10.s\n"
"fmla z31.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x17, LSL #2]\n"
"fmla z17.s, p3/M, z8.s, z14.s\n"
- "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
"fmla z18.s, p3/M, z7.s, z14.s\n"
"fmla z19.s, p3/M, z6.s, z14.s\n"
- "ld1w { z10.s }, p2/Z, [x14, x4, LSL #2]\n"
"fmla z24.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
"fmla z25.s, p3/M, z3.s, z9.s\n"
"fmla z20.s, p3/M, z1.s, z9.s\n"
"fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n"
"fmla z26.s, p3/M, z5.s, z11.s\n"
"fmla z27.s, p3/M, z4.s, z11.s\n"
"fmla z22.s, p3/M, z2.s, z11.s\n"
@@ -640,24 +640,24 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
"fmla z19.s, p3/M, z4.s, z12.s\n"
".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
- "st1w { z24.s }, p0, [x28]\n"
".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
"st1w { z25.s }, p0, [x28, x9, LSL #2]\n"
- "st1w { z26.s }, p0, [x28, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x28, x22, LSL #2]\n"
- "st1w { z20.s }, p0, [x27]\n"
- "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
- "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
- "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
- "st1w { z28.s }, p0, [x25]\n"
- "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
- "st1w { z30.s }, p0, [x25, x24, LSL #2]\n"
- "st1w { z31.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z26.s }, p0, [x28, x27, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x26, LSL #2]\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "st1w { z21.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x25, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x25, x26, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z29.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x24, x26, LSL #2]\n"
"st1w { z16.s }, p0, [x23]\n"
"st1w { z17.s }, p0, [x23, x9, LSL #2]\n"
- "st1w { z18.s }, p0, [x23, x24, LSL #2]\n"
- "st1w { z19.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z18.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x26, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
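For reference, a scalar sketch of the activation clamp these kernels apply before every store: the `fclamp { zN.s-zM.s }, zmin, zmax` instructions (and the `fmax`-then-`fmin` pairs in the channel-tail paths) bound each accumulator to [args.min, args.max], which are broadcast-loaded via `offsetof_args_min`/`offsetof_args_max`. The helper below is an illustrative assumption, not code from this file.

    #include <algorithm>

    // Sketch only: per-element equivalent of the SVE/SME2 fclamp on the accumulators.
    static inline float clamp_activation(float v, float act_min, float act_max)
    {
      return std::min(std::max(v, act_min), act_max);
    }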
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 2e2a45bab0..44bfbf4849 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -101,540 +101,540 @@ void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x16, #0x0\n"
"ptrue p3.b\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z13.s }, p3/Z, [x8]\n"
- "addvl x8, x8, #1\n"
"ldp x23, x22, [x17, #0x0]\n"
"ldp x21, x20, [x17, #0x10]\n"
- "cntw x16\n"
+ "cntw x15\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "cmp x15, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
"addvl x8, x8, #4\n"
- "mov x15, #0x0\n"
- "whilelt p2.s, XZR, %x[n_channels]\n"
+ "sub x13, XZR, x15\n"
".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
"addvl x8, x8, #4\n"
- "cmp x16, %x[n_channels]\n"
- "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x13, XZR, x16\n"
"ld1w { z8.s }, p3/Z, [x8]\n"
"addvl x8, x8, #1\n"
- "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z4.s, z9.s\n"
"movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z9.s\n"
"ldr x24, [x17, #0x20]\n"
"incw x13\n"
- "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z13\n fmla z29.s, p3/M, z1.s, z9.s\n"
"ldr x20, [x17, #0x30]\n"
"mov p1.b, p2.b\n"
- "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z0.s, z9.s\n"
"ldr x21, [x17, #0x28]\n"
"movprfx z17, z13\n fmla z17.s, p3/M, z7.s, z9.s\n"
- "whilelt p0.s, x16, %x[n_channels]\n"
+ "whilelt p0.s, x15, %x[n_channels]\n"
"movprfx z18, z13\n fmla z18.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z5.s, z9.s\n"
"ldr x23, [x17, #0x38]\n"
- "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x22, [x17, #0x40]\n"
"fmla z16.s, p3/M, z0.s, z10.s\n"
"movprfx z19, z13\n fmla z19.s, p3/M, z2.s, z11.s\n"
- "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
"ldr x20, [x17, #0x48]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x16, LSL #2]\n"
"ldr x27, [x17, #0x50]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
"fmla z17.s, p3/M, z8.s, z12.s\n"
"ldr x26, [x17, #0x60]\n"
"fmla z18.s, p3/M, z7.s, z12.s\n"
- "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z22.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x25, [x17, #0x68]\n"
- "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
"fmla z19.s, p3/M, z6.s, z12.s\n"
"ldr x21, [x17, #0x58]\n"
- "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x16, LSL #2]\n"
"ldr x24, [x17, #0x70]\n"
- "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z21.s\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z22.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
"ldr x23, [x17, #0x78]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
"ldr x22, [x17, #0x80]\n"
"movprfx z21, z13\n fmla z21.s, p3/M, z1.s, z9.s\n"
"movprfx z22, z13\n fmla z22.s, p3/M, z0.s, z9.s\n"
"ldr x20, [x17, #0x88]\n"
"ld1w { z13.s }, p3/Z, [x8]\n"
- "fmla z28.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
"ldr x12, [x14, #0x0]\n"
"addvl x8, x8, #1\n"
"fmla z20.s, p3/M, z2.s, z9.s\n"
- "fmla z16.s, p3/M, z1.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x16, LSL #2]\n"
"ldr x27, [x17, #0x90]\n"
- "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
"fmla z18.s, p3/M, z2.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
"ldr x21, [x17, #0x98]\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z25.s, p3/M, z8.s, z10.s\n"
"fmla z19.s, p3/M, z1.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x26, x16, LSL #2]\n"
"ldr x26, [x17, #0xa0]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z10.s\n"
+ "fmla z27.s, p3/M, z6.s, z10.s\n"
"ldr x11, [x14, #0x8]\n"
- "fmla z25.s, p3/M, z5.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
"ldr x10, [x14, #0x10]\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
- "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
"ldr x9, [x14, #0x18]\n"
- "fmla z22.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x16, LSL #2]\n"
"ldr x25, [x17, #0xa8]\n"
"fmla z16.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z9.s\n"
- "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
"fmla z20.s, p3/M, z3.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
"ldr x24, [x17, #0xb0]\n"
- "fmla z17.s, p3/M, z4.s, z11.s\n"
- "fmla z18.s, p3/M, z3.s, z11.s\n"
- "fmla z29.s, p3/M, z1.s, z11.s\n"
- "fmla z19.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x16, LSL #2]\n"
"ldr x23, [x17, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z9.s\n"
- "fmla z23.s, p3/M, z5.s, z9.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x16, LSL #2]\n"
"ldr x22, [x17, #0xc0]\n"
- "fmla z16.s, p3/M, z5.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z5.s, z10.s\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x20, [x17, #0xc8]\n"
- "fmla z17.s, p3/M, z5.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "fmla z19.s, p3/M, z3.s, z12.s\n"
- "fmla z30.s, p3/M, z1.s, z12.s\n"
- "fmla z31.s, p3/M, z0.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "fmla z26.s, p3/M, z1.s, z11.s\n"
+ "fmla z27.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
"ldr x28, [x17, #0xd8]\n"
- "fmla z20.s, p3/M, z7.s, z10.s\n"
- "fmla z21.s, p3/M, z6.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z9.s\n"
+ "fmla z21.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x16, LSL #2]\n"
"ldr x21, [x17, #0xd0]\n"
- "fmla z16.s, p3/M, z7.s, z11.s\n"
- "fmla z17.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z4.s, z11.s\n"
- "fmla z29.s, p3/M, z3.s, z11.s\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z25.s, p3/M, z0.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z3.s, z12.s\n"
+ "fmla z28.s, p3/M, z1.s, z12.s\n"
+ "fmla z29.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x26, x16, LSL #2]\n"
"ldr x27, [x17, #0xe0]\n"
- "fmla z18.s, p3/M, z8.s, z9.s\n"
- "fmla z22.s, p3/M, z8.s, z10.s\n"
- "fmla z23.s, p3/M, z7.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z22.s, p3/M, z8.s, z9.s\n"
+ "fmla z23.s, p3/M, z7.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
"ldr x26, [x17, #0xe8]\n"
- "fmla z19.s, p3/M, z7.s, z9.s\n"
- "fmla z30.s, p3/M, z5.s, z9.s\n"
- "fmla z31.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x16, LSL #2]\n"
"ldr x25, [x17, #0xf0]\n"
- "fmla z16.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z1.s, z11.s\n"
- "fmla z18.s, p3/M, z0.s, z11.s\n"
- "fmla z28.s, p3/M, z7.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z1.s, z10.s\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
"ldr x24, [x17, #0xf8]\n"
- "fmla z29.s, p3/M, z6.s, z10.s\n"
- "fmla z24.s, p3/M, z4.s, z10.s\n"
- "fmla z25.s, p3/M, z3.s, z10.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z10.s\n"
- "ldr x23, [x17, #0x100]\n"
- "fmla z22.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z2.s, z9.s\n"
- "fmla z18.s, p3/M, z1.s, z9.s\n"
- "fmla z19.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "ldr x22, [x17, #0x100]\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x20, [x17, #0x108]\n"
- "fmla z16.s, p3/M, z6.s, z11.s\n"
- "fmla z28.s, p3/M, z3.s, z11.s\n"
- "fmla z24.s, p3/M, z0.s, z11.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
- "ldr x22, [x17, #0x110]\n"
- "fmla z31.s, p3/M, z7.s, z10.s\n"
- "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z16.s, p3/M, z6.s, z9.s\n"
+ "fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z26.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x23, [x17, #0x110]\n"
+ "fmla z27.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
"fmla z23.s, p3/M, z1.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
"ldr x21, [x17, #0x118]\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
"fmla z20.s, p3/M, z0.s, z11.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x16, LSL #2]\n"
"fmla z21.s, p3/M, z4.s, z10.s\n"
"fmla z22.s, p3/M, z3.s, z10.s\n"
- "fmla z19.s, p3/M, z8.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z9.s\n"
- "fmla z28.s, p3/M, z6.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z10.s\n"
- "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
"fmla z20.s, p3/M, z5.s, z10.s\n"
- "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
"fmla z23.s, p3/M, z2.s, z9.s\n"
- "fmla z21.s, p3/M, z7.s, z12.s\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "fmla z24.s, p3/M, z8.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z21.s, p3/M, z5.s, z11.s\n"
- "fmla z22.s, p3/M, z4.s, z11.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldp x20, x25, [x17, #0x0]\n"
- "fmla z31.s, p3/M, z8.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z8.s, z11.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "fmla z23.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldp x20, x22, [x17, #0x0]\n"
"fmla z16.s, p3/M, z4.s, z10.s\n"
"fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z5.s, z11.s\n"
- "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
- "fmla z19.s, p3/M, z4.s, z11.s\n"
- "fmla z21.s, p3/M, z8.s, z12.s\n"
- "fmla z22.s, p3/M, z7.s, z12.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z10.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ld1w { z0.s }, p2/Z, [x21, x15, LSL #2]\n"
- "ldp x20, x24, [x17, #0x10]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
- "incw x15\n"
- "ld1w { z11.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "incw x16\n"
".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z7.s, z0.s\n"
+ "fmla z29.s, p3/M, z6.s, z0.s\n"
+ "ld1w { z11.s }, p0/Z, [x21, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z0.s\n"
+ "fmla z21.s, p3/M, z3.s, z0.s\n"
+ "ld1w { z12.s }, p0/Z, [x20, x15, LSL #2]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
"st1w { z16.s }, p1, [x12, x13, LSL #2]\n"
"ldr x23, [x14, #0x20]\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z10.s }, p0/Z, [x22, x15, LSL #2]\n"
"st1w { z17.s }, p1, [x11, x13, LSL #2]\n"
"ldr x22, [x14, #0x28]\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "fmla z26.s, p3/M, z8.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "incw x15\n"
"st1w { z18.s }, p1, [x10, x13, LSL #2]\n"
"ldr x21, [x14, #0x30]\n"
- "fmla z27.s, p3/M, z7.s, z0.s\n"
- ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
"st1w { z19.s }, p1, [x9, x13, LSL #2]\n"
"ldr x20, [x14, #0x38]\n"
- "fmla z20.s, p3/M, z4.s, z12.s\n"
- "fmla z21.s, p3/M, z3.s, z12.s\n"
- "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
"ldr x23, [x14, #0x40]\n"
- "fmla z22.s, p3/M, z5.s, z0.s\n"
- "fmla z23.s, p3/M, z4.s, z0.s\n"
- "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "addvl x8, x8, #1\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
"ldr x22, [x14, #0x48]\n"
- ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
- "ld1w { z10.s }, p0/Z, [x25, x16, LSL #2]\n"
- "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
"ldr x21, [x14, #0x50]\n"
- "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
- "incw x16\n"
- "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
"ldr x20, [x14, #0x58]\n"
- ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
- "addvl x8, x8, #4\n"
- "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
"ldr x23, [x14, #0x60]\n"
- "whilelt p2.s, x15, %x[n_channels]\n"
- ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
- "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
"ldr x22, [x14, #0x68]\n"
- "addvl x8, x8, #4\n"
- "cmp x16, %x[n_channels]\n"
- "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
"ldr x21, [x14, #0x70]\n"
- ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
- "ld1w { z8.s }, p3/Z, [x8]\n"
- "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
"ldr x20, [x14, #0x78]\n"
- "addvl x8, x8, #1\n"
"st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
"st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
"st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
"st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
- "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z8.s, z9.s\n"
"ldr x24, [x17, #0x20]\n"
"incw x13\n"
- "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z13\n fmla z29.s, p3/M, z1.s, z9.s\n"
"ldr x20, [x17, #0x30]\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z0.s, z9.s\n"
"ldr x23, [x17, #0x28]\n"
- "movprfx z21, z13\n fmla z21.s, p3/M, z7.s, z9.s\n"
- "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z5.s, z9.s\n"
"ldr x22, [x17, #0x38]\n"
- "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
- "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x21, [x17, #0x40]\n"
- "fmla z20.s, p3/M, z0.s, z10.s\n"
- "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z11.s\n"
- "ld1w { z19.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x16, LSL #2]\n"
"ldr x20, [x17, #0x48]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z25.s, p3/M, z2.s, z12.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x16, LSL #2]\n"
"ldr x27, [x17, #0x50]\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
"ldr x26, [x17, #0x60]\n"
- "fmla z22.s, p3/M, z7.s, z12.s\n"
- "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z19.s\n"
- "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x25, [x17, #0x68]\n"
- "fmla z29.s, p3/M, z7.s, z9.s\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
"ldr x20, [x17, #0x58]\n"
- "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
- "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
"ldr x24, [x17, #0x70]\n"
- "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z17.s\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
"ldr x23, [x17, #0x78]\n"
- "fmla z25.s, p3/M, z4.s, z9.s\n"
- "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
"ldr x22, [x17, #0x80]\n"
"movprfx z17, z13\n fmla z17.s, p3/M, z1.s, z9.s\n"
"movprfx z18, z13\n fmla z18.s, p3/M, z0.s, z9.s\n"
"ldr x21, [x17, #0x88]\n"
- "fmla z28.s, p3/M, z8.s, z9.s\n"
- "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
"ldr x12, [x14, #0x0]\n"
"fmla z16.s, p3/M, z2.s, z9.s\n"
- "fmla z20.s, p3/M, z1.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x27, x16, LSL #2]\n"
"ldr x27, [x17, #0x90]\n"
- "fmla z21.s, p3/M, z0.s, z12.s\n"
- "fmla z22.s, p3/M, z2.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x20, [x17, #0x98]\n"
- "fmla z29.s, p3/M, z8.s, z11.s\n"
- "fmla z23.s, p3/M, z1.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x16, LSL #2]\n"
"ldr x26, [x17, #0xa0]\n"
- "fmla z30.s, p3/M, z7.s, z11.s\n"
- "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
"ldr x11, [x14, #0x8]\n"
- "fmla z25.s, p3/M, z5.s, z11.s\n"
- "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
"ldr x10, [x14, #0x10]\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
"fmla z17.s, p3/M, z2.s, z11.s\n"
"ldr x9, [x14, #0x18]\n"
"fmla z18.s, p3/M, z1.s, z11.s\n"
"fmla z19.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x16, LSL #2]\n"
"ldr x25, [x17, #0xa8]\n"
- "fmla z20.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z9.s\n"
- "fmla z24.s, p3/M, z6.s, z12.s\n"
- "fmla z16.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x16, LSL #2]\n"
"ldr x24, [x17, #0xb0]\n"
- "fmla z21.s, p3/M, z4.s, z10.s\n"
- "fmla z22.s, p3/M, z3.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
- "fmla z23.s, p3/M, z5.s, z13.s\n"
- "fmla z31.s, p3/M, z2.s, z13.s\n"
- "fmla z30.s, p3/M, z0.s, z10.s\n"
- "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
+ "fmla z26.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z1.s, z11.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z10.s\n"
+ "fmla z22.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
"ldr x23, [x17, #0xb8]\n"
- "fmla z27.s, p3/M, z8.s, z12.s\n"
- "fmla z19.s, p3/M, z5.s, z12.s\n"
- "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "fmla z19.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x16, LSL #2]\n"
"ldr x22, [x17, #0xc0]\n"
- "fmla z20.s, p3/M, z5.s, z10.s\n"
- "fmla z28.s, p3/M, z2.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "fmla z20.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
"ldr x21, [x17, #0xc8]\n"
- "fmla z21.s, p3/M, z5.s, z13.s\n"
- "fmla z22.s, p3/M, z4.s, z13.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "fmla z23.s, p3/M, z3.s, z13.s\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "fmla z31.s, p3/M, z0.s, z13.s\n"
- "ld1w { z10.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z22.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x28, [x17, #0xd8]\n"
"fmla z16.s, p3/M, z7.s, z9.s\n"
"fmla z17.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x27, x16, LSL #2]\n"
"ldr x20, [x17, #0xd0]\n"
- "fmla z20.s, p3/M, z7.s, z12.s\n"
- "fmla z21.s, p3/M, z6.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z29.s, p3/M, z3.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z12.s\n"
- "fmla z25.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z25.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z4.s, z11.s\n"
+ "fmla z21.s, p3/M, z3.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x16, LSL #2]\n"
"ldr x27, [x17, #0xe0]\n"
- "fmla z22.s, p3/M, z8.s, z10.s\n"
- "fmla z18.s, p3/M, z8.s, z11.s\n"
- "fmla z19.s, p3/M, z7.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z13.s\n"
+ "fmla z19.s, p3/M, z7.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
"ldr x26, [x17, #0xe8]\n"
- "fmla z23.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z26.s, p3/M, z2.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
"ldr x25, [x17, #0xf0]\n"
- "fmla z20.s, p3/M, z2.s, z12.s\n"
- "fmla z21.s, p3/M, z1.s, z12.s\n"
- "fmla z22.s, p3/M, z0.s, z12.s\n"
- "fmla z28.s, p3/M, z7.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x16, LSL #2]\n"
"ldr x24, [x17, #0xf8]\n"
- "fmla z29.s, p3/M, z6.s, z11.s\n"
- "fmla z24.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmla z16.s, p3/M, z1.s, z11.s\n"
- "fmla z17.s, p3/M, z0.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z1.s, z10.s\n"
+ "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
"ldr x23, [x17, #0x100]\n"
- "fmla z18.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z2.s, z9.s\n"
- "fmla z22.s, p3/M, z1.s, z9.s\n"
- "fmla z23.s, p3/M, z0.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
"ldr x22, [x17, #0x108]\n"
- "fmla z20.s, p3/M, z6.s, z12.s\n"
- "fmla z28.s, p3/M, z3.s, z12.s\n"
- "fmla z24.s, p3/M, z0.s, z12.s\n"
- "fmla z30.s, p3/M, z8.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z13.s\n"
+ "fmla z20.s, p3/M, z3.s, z13.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "fmla z22.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x16, LSL #2]\n"
"ldr x21, [x17, #0x110]\n"
- "fmla z31.s, p3/M, z7.s, z10.s\n"
- "fmla z26.s, p3/M, z5.s, z10.s\n"
- "fmla z19.s, p3/M, z1.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x28, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z9.s\n"
+ "fmla z19.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
"ldr x20, [x17, #0x118]\n"
- "fmla z16.s, p3/M, z0.s, z12.s\n"
- "fmla z17.s, p3/M, z4.s, z9.s\n"
- "fmla z18.s, p3/M, z3.s, z9.s\n"
- "fmla z23.s, p3/M, z8.s, z11.s\n"
- "fmla z31.s, p3/M, z5.s, z11.s\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z25.s, p3/M, z7.s, z9.s\n"
- "fmla z26.s, p3/M, z6.s, z9.s\n"
- "fmla z16.s, p3/M, z5.s, z9.s\n"
- "fmla z27.s, p3/M, z5.s, z10.s\n"
- "fmla z19.s, p3/M, z2.s, z10.s\n"
- "fmla z17.s, p3/M, z7.s, z12.s\n"
- "fmla z18.s, p3/M, z6.s, z12.s\n"
- "fmla z24.s, p3/M, z8.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z16.s, p3/M, z8.s, z12.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z7.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z11.s\n"
- "fmla z17.s, p3/M, z5.s, z11.s\n"
- "fmla z18.s, p3/M, z4.s, z11.s\n"
- "fmla z19.s, p3/M, z3.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z8.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
- "fmla z20.s, p3/M, z4.s, z9.s\n"
- "fmla z21.s, p3/M, z3.s, z9.s\n"
- "fmla z22.s, p3/M, z5.s, z11.s\n"
- "fmla z23.s, p3/M, z4.s, z11.s\n"
- "fmla z17.s, p3/M, z8.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z12.s\n"
- "fmla z19.s, p3/M, z6.s, z12.s\n"
- "ld1w { z13.s }, p2/Z, [x21, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z1.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z9.s\n"
- "ld1w { z0.s }, p2/Z, [x20, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z2.s, z11.s\n"
- "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z16.s, p3/M, z0.s, z11.s\n"
+ "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z13.s\n"
+ "fmla z19.s, p3/M, z2.s, z13.s\n"
+ "fmla z17.s, p3/M, z7.s, z9.s\n"
+ "fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z8.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z23.s, p3/M, z8.s, z13.s\n"
+ "ld1w { z13.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z19.s, p3/M, z3.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z3.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z13.s\n"
+ "fmla z18.s, p3/M, z7.s, z13.s\n"
+ "fmla z19.s, p3/M, z6.s, z13.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z13.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z9.s\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "fmla z31.s, p3/M, z7.s, z9.s\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "fmla z16.s, p3/M, z4.s, z13.s\n"
+ "fmla z17.s, p3/M, z3.s, z13.s\n"
".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
- "st1w { z20.s }, p0, [x12, x13, LSL #2]\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "st1w { z24.s }, p0, [x12, x13, LSL #2]\n"
"ldr x23, [x14, #0x20]\n"
- "fmla z24.s, p3/M, z7.s, z13.s\n"
- "st1w { z21.s }, p0, [x11, x13, LSL #2]\n"
+ "fmla z19.s, p3/M, z4.s, z9.s\n"
+ "st1w { z25.s }, p0, [x11, x13, LSL #2]\n"
"ldr x22, [x14, #0x28]\n"
- "fmla z25.s, p3/M, z6.s, z13.s\n"
- "fmla z26.s, p3/M, z8.s, z0.s\n"
- "st1w { z22.s }, p0, [x10, x13, LSL #2]\n"
- "ldr x21, [x14, #0x30]\n"
- "fmla z27.s, p3/M, z7.s, z0.s\n"
".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
- "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "st1w { z26.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "st1w { z27.s }, p0, [x9, x13, LSL #2]\n"
"ldr x20, [x14, #0x38]\n"
- "fmla z16.s, p3/M, z4.s, z13.s\n"
- "fmla z17.s, p3/M, z3.s, z13.s\n"
- "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z20.s }, p0, [x23, x13, LSL #2]\n"
"ldr x23, [x14, #0x40]\n"
- "fmla z18.s, p3/M, z5.s, z0.s\n"
- "fmla z19.s, p3/M, z4.s, z0.s\n"
- "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
- "ldr x22, [x14, #0x48]\n"
- ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
- "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z21.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "st1w { z22.s }, p0, [x21, x13, LSL #2]\n"
"ldr x21, [x14, #0x50]\n"
- "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ "st1w { z23.s }, p0, [x20, x13, LSL #2]\n"
"ldr x20, [x14, #0x58]\n"
- "st1w { z24.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
"ldr x23, [x14, #0x60]\n"
- "st1w { z25.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
"ldr x22, [x14, #0x68]\n"
- "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
"ldr x21, [x14, #0x70]\n"
- "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
"ldr x20, [x14, #0x78]\n"
"st1w { z16.s }, p0, [x23, x13, LSL #2]\n"
"st1w { z17.s }, p0, [x22, x13, LSL #2]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 066b935486..131a8eec01 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,78 +88,78 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ptrue p3.b\n"
- ".inst 0x25207810 // ptrue pn8.b\n"
"mov x2, #0x0\n"
"mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
"1:" // Tile loop
"str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
"mov x22, #0x4\n"
"str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
"ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
"madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "add x7, x4, x4\n"
"mul x20, x20, x22\n" // offset *= kernel_stride * output_size
- "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x8, x7, x4\n"
"add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x6, x5, x21, LSL #2\n"
- "add x7, x6, x21, LSL #2\n"
- "add x8, x4, x4\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x16, x7, x21, LSL #2\n"
- "add x15, x8, x4\n"
- "add x14, x16, x21, LSL #2\n"
- "add x13, x15, x4\n"
+ "add x17, x8, x4\n"
+ "add x16, x5, x21, LSL #2\n"
+ "add x15, x16, x21, LSL #2\n"
+ "add x14, x15, x21, LSL #2\n"
+ "add x13, x14, x21, LSL #2\n"
"cbnz x3, 2f\n"
"ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "sub x21, x20, x3\n"
- "sub x21, x21, #0x1\n"
"lsl x12, %x[n_channels], #0x2\n"
- "mov x20, #0x10\n"
- "and x21, x21, #0x3fffff\n"
- "mul x20, x20, x4\n"
- "orr x12, x12, x21, LSL #22\n"
- "orr x12, x12, x20, LSL #38\n"
- "add x27, x7, x8, LSL #2\n"
+ "mov x28, #0x10\n"
+ "mul x28, x28, x4\n"
+ "add x27, x15, x7, LSL #2\n"
"add x26, x5, x4, LSL #2\n"
- "add x25, x5, x15, LSL #2\n"
- "add x24, x5, x13, LSL #2\n"
- "add x23, x6, x4, LSL #2\n"
- "add x22, x5, x8, LSL #2\n"
- "add x21, x6, x15, LSL #2\n"
- "add x20, x6, x13, LSL #2\n"
- "add x11, x6, x8, LSL #2\n"
- "add x10, x16, x4, LSL #2\n"
- "add x9, x7, x4, LSL #2\n"
- "add x28, x16, x15, LSL #2\n"
+ "add x25, x5, x8, LSL #2\n"
+ "sub x20, x20, x3\n"
+ "add x24, x5, x17, LSL #2\n"
+ "sub x20, x20, #0x1\n"
+ "add x23, x16, x4, LSL #2\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x22, x5, x7, LSL #2\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x21, x16, x8, LSL #2\n"
+ "orr x12, x12, x28, LSL #38\n"
+ "add x20, x16, x17, LSL #2\n"
+ "add x11, x16, x7, LSL #2\n"
+ "add x10, x14, x4, LSL #2\n"
+ "add x9, x15, x4, LSL #2\n"
+ "add x28, x14, x8, LSL #2\n"
".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
- "add x27, x7, x15, LSL #2\n"
+ "add x27, x15, x8, LSL #2\n"
".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
- "add x26, x16, x13, LSL #2\n"
+ "add x26, x14, x17, LSL #2\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
- "add x25, x7, x13, LSL #2\n"
+ "add x25, x15, x17, LSL #2\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
- "add x24, x14, x4, LSL #2\n"
- ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ "add x24, x13, x4, LSL #2\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
- "add x23, x16, x8, LSL #2\n"
+ "add x23, x14, x7, LSL #2\n"
".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
- "add x22, x14, x15, LSL #2\n"
+ "add x22, x13, x8, LSL #2\n"
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
- "add x21, x14, x8, LSL #2\n"
+ "add x21, x13, x7, LSL #2\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
- "add x20, x14, x13, LSL #2\n"
+ "add x20, x13, x17, LSL #2\n"
".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
- ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
- ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac49fa // rprfm pldonce, x12, [x15]\n"
".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
- ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
@@ -167,199 +167,199 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
"2:" // Tile loop: Prefetch input rows: End
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"mov x20, #0x2\n"
- "ld1w { z22.s }, p3/Z, [x17]\n"
- "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
- "addvl x17, x17, #1\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "ld1w { z28.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x25\n"
+ ".inst 0xa040c0c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
"ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "mul x21, x21, x20\n" // offset *= output_tile_size
- "cntw x23\n"
- "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "addvl x17, x17, #4\n"
- "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "addvl x17, x17, #4\n"
- "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cmp x23, %x[n_channels]\n"
- "add x22, x24, x22, LSL #2\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
+ ".inst 0xa040c0c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x22, x2, x23\n" // offset = tile_i * ld_output_row
+ "cmp x25, %x[n_channels]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x3, x26, x22\n" // offset += tile_j * ld_output_col
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"mov x21, #0x0\n"
- "sub x20, XZR, x23\n"
- "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "mul x22, x22, x20\n" // offset *= output_tile_size
+ "sub x20, XZR, x25\n"
+ "ld1w { z8.s }, p3/Z, [x6]\n"
+ "add x24, x24, x22, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z9.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "add x23, x24, x23, LSL #2\n"
"ld1w { z10.s }, p2/Z, [x5]\n"
- "addvl x17, x17, #1\n"
"ld1w { z11.s }, p2/Z, [x5, x4, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x5, x13, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x6]\n"
- "ld1w { z15.s }, p2/Z, [x6, x4, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x5, x17, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
+ "ld1w { z15.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x5, x7, LSL #2]\n"
"bge 4f\n"
"3:" // Tile loop: Channel loop
- "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "whilelt p1.s, x23, %x[n_channels]\n"
+ "movprfx z24, z28\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z25, z28\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x25, %x[n_channels]\n"
"incw x21\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
- "incw x23\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z27.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "movprfx z26, z28\n fmla z26.s, p3/M, z2.s, z9.s\n"
+ "movprfx z27, z28\n fmla z27.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z28.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "incw x25\n"
"mov p0.b, p2.b\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x6, x8, LSL #2]\n"
"addvl x5, x5, #1\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z27.s\n"
- "ld1w { z25.s }, p2/Z, [x16]\n"
- "addvl x6, x6, #1\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z18.s\n"
- "ld1w { z12.s }, p2/Z, [x7]\n"
"incw x20\n"
- "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z18.s }, p2/Z, [x7, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
- "fmla z29.s, p3/M, z3.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
- "fmla z30.s, p3/M, z3.s, z25.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
- "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z12.s\n"
- "fmla z31.s, p3/M, z1.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z22.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "addvl x7, x7, #1\n"
- "fmla z31.s, p3/M, z2.s, z22.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14]\n"
- "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z16.s\n"
- "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x17, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x16, x7, LSL #2]\n"
"addvl x16, x16, #1\n"
- "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z29.s, p3/M, z7.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "fmla z30.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z22.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z16.s\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "ld1w { z22.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
- "cmp x23, %x[n_channels]\n"
- ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
- ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z25.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x15]\n"
+ "ld1w { z17.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, x8, LSL #2]\n"
+ "ld1w { z0.s }, p2/Z, [x15, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, x17, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "fmla z24.s, p3/M, z5.s, z22.s\n"
+ "fmla z25.s, p3/M, z3.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x15, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z26.s, p3/M, z1.s, z0.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x13]\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z0.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z20.s\n"
"addvl x14, x14, #1\n"
- "st1w { z28.s }, p0, [x24]\n"
- ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
- "addvl x17, x17, #4\n"
- "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
- "addvl x24, x24, #1\n"
- "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "ld1w { z10.s }, p1/Z, [x5]\n"
- "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
- "addvl x22, x22, #1\n"
+ "ld1w { z12.s }, p1/Z, [x5, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z18.s }, p2/Z, [x13, x8, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z13.s }, p1/Z, [x5, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z20.s\n"
+ "fmla z26.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "cmp x25, %x[n_channels]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z27.s, p3/M, z3.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ ".inst 0xa040c0c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ld1w { z16.s }, p1/Z, [x5, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z14.s }, p1/Z, [x16]\n"
+ "fmla z27.s, p3/M, z6.s, z19.s\n"
+ ".inst 0xa040c0c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ld1w { z15.s }, p1/Z, [x16, x4, LSL #2]\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
"ld1w { z11.s }, p1/Z, [x5, x4, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x5, x15, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x5, x13, LSL #2]\n"
- "ld1w { z14.s }, p1/Z, [x6]\n"
- "ld1w { z15.s }, p1/Z, [x6, x4, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x5, x8, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17]\n"
- "addvl x17, x17, #1\n"
+ "ld1w { z8.s }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ ".inst 0xc1bfcbd8 // fclamp { z24.s-z27.s }, z30.s, z31.s\n"
+ "st1w { z24.s }, p0, [x24]\n"
+ "st1w { z25.s }, p0, [x24, x26, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z26.s }, p0, [x23]\n"
+ "st1w { z27.s }, p0, [x23, x26, LSL #2]\n"
+ "addvl x23, x23, #1\n"
"blt 3b\n"
"4:" // Tile loop: Channel tail
- "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z28\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z25, z28\n fmla z25.s, p3/M, z6.s, z9.s\n"
"ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "add x3, x3, #0x1\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z28\n fmla z26.s, p3/M, z2.s, z9.s\n"
+ "movprfx z27, z28\n fmla z27.s, p3/M, z0.s, z9.s\n"
"ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "fmla z28.s, p3/M, z1.s, z11.s\n"
- "fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "fmla z28.s, p3/M, z3.s, z14.s\n"
- "fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z20.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
"ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x16]\n"
- "cmp x3, x20\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x7]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x16, x17, LSL #2]\n"
"add x20, x2, #0x1\n"
- "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ld1w { z19.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "cmp x3, x22\n"
"csel x2, x2, x20, LT\n"
- "fmla z28.s, p3/M, z5.s, z20.s\n"
- "fmla z29.s, p3/M, z3.s, z20.s\n"
- "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
- "mov p0.b, p2.b\n"
- "fmla z30.s, p3/M, z3.s, z17.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
"csel x3, x3, XZR, LT\n"
- "fmla z30.s, p3/M, z0.s, z18.s\n"
- "fmla z31.s, p3/M, z1.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
"cmp x2, x21\n"
- "fmla z30.s, p3/M, z4.s, z17.s\n"
- "fmla z31.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x7, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "fmla z31.s, p3/M, z2.s, z18.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14]\n"
- "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
- "fmla z30.s, p3/M, z6.s, z16.s\n"
- "fmla z31.s, p3/M, z3.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z29.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "fmla z30.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z16.s\n"
- ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
- "st1w { z28.s }, p0, [x24]\n"
- "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
- "st1w { z30.s }, p0, [x22]\n"
- "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x16, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z25.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z23.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z25.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ "ld1w { z19.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z22.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, x8, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x15, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x15, x17, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z20.s\n"
+ "fmla z25.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x13, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x13]\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, x7, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z21.s\n"
+ "fmla z27.s, p3/M, z5.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x8, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z20.s\n"
+ "fmla z27.s, p3/M, z2.s, z21.s\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z17.s\n"
+ "fmla z27.s, p3/M, z6.s, z19.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ ".inst 0xc1bfcbd8 // fclamp { z24.s-z27.s }, z30.s, z31.s\n"
+ "st1w { z24.s }, p0, [x24]\n"
+ "st1w { z25.s }, p0, [x24, x26, LSL #2]\n"
+ "st1w { z26.s }, p0, [x23]\n"
+ "st1w { z27.s }, p0, [x23, x26, LSL #2]\n"
"blt 1b\n"
".inst 0xd503467f // SMSTOP\n"
:
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index dc7a40ff54..7ca4cafbe6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,221 +90,221 @@ void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
"ptrue p3.b\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
".inst 0x25207810 // ptrue pn8.b\n"
- "ld1w { z26.s }, p3/Z, [x15]\n"
- "addvl x15, x15, #1\n"
- "ldp x14, x13, [x20, #0x0]\n"
- "cntw x12\n"
- ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
- "addvl x15, x15, #4\n"
- "ldp x11, x10, [x20, #0x10]\n"
- "mov x9, #0x0\n"
+ "cntw x13\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- "ldp x28, x26, [x16, #0x0]\n"
- "addvl x15, x15, #4\n"
- "cmp x12, %x[n_channels]\n"
- "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rw { z27.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x10, XZR, x13\n"
+ "ldp x9, x28, [x20, #0x10]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
"ldp x25, x24, [x16, #0x10]\n"
- "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "sub x27, XZR, x12\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
"ldp x23, x22, [x16, #0x20]\n"
- "ld1w { z8.s }, p3/Z, [x15]\n"
- "addvl x15, x15, #1\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
"ldp x21, x20, [x16, #0x30]\n"
- "ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x21, [x16, #0x40]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
+ "movprfx z28, z23\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x25, [x16, #0x40]\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "ldr x22, [x16, #0x48]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "ldr x21, [x16, #0x50]\n"
+ "addvl x14, x14, #1\n"
+ "incw x10\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov p0.b, p2.b\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x20, [x16, #0x48]\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x26, [x16, #0x68]\n"
+ "ldr x23, [x16, #0x88]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z22.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x20, [x16, #0x50]\n"
+ "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x90]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x16, #0x58]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z22.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z22.s\n"
- "ldr x21, [x16, #0x78]\n"
- "ld1w { z23.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z25.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z18.s\n"
- "ldr x20, [x16, #0x60]\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
- "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x20, [x16, #0x80]\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
- "fmla z29.s, p3/M, z3.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla z30.s, p3/M, z3.s, z23.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
- "ldr x20, [x16, #0x88]\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z13.s\n"
- "fmla z31.s, p3/M, z1.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x16, #0x70]\n"
- "ldr x20, [x16, #0x98]\n"
+ "fmla z29.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x16, #0xb0]\n"
"fmla z30.s, p3/M, z4.s, z17.s\n"
- "fmla z31.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z13.s\n"
- "ld1w { z4.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x16, #0x90]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x20, [x16, #0xa8]\n"
- "fmla z31.s, p3/M, z2.s, z4.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x16, #0xa0]\n"
- "fmla z30.s, p3/M, z6.s, z16.s\n"
- "fmla z31.s, p3/M, z3.s, z17.s\n"
- "ldr x20, [x16, #0xb0]\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z29.s, p3/M, z7.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
"ldr x20, [x16, #0xb8]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "fmla z30.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x16, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z4.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldp x20, x26, [x16, #0x0]\n"
- "fmla z30.s, p3/M, z8.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z1.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z1.s, z25.s\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z15.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z20.s\n"
+ "fmla z31.s, p3/M, z5.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z20.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
"ldp x25, x24, [x16, #0x10]\n"
- "ld1w { z26.s }, p3/Z, [x15]\n"
- "addvl x15, x15, #1\n"
- "incw x9\n"
+ "incw x15\n"
"ldp x23, x22, [x16, #0x20]\n"
- "ld1w { z9.s }, p1/Z, [x20, x12, LSL #2]\n"
- "incw x27\n"
- "mov p0.b, p2.b\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
"ldp x21, x20, [x16, #0x30]\n"
- "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
- "whilelt p2.s, x9, %x[n_channels]\n"
- ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
- "ld1w { z11.s }, p1/Z, [x25, x12, LSL #2]\n"
- "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x24, x12, LSL #2]\n"
- "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
- "ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
- "ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
- "incw x12\n"
- "cmp x12, %x[n_channels]\n"
- ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
- "addvl x15, x15, #4\n"
- ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
- "addvl x15, x15, #4\n"
- "ld1w { z8.s }, p3/Z, [x15]\n"
- "addvl x15, x15, #1\n"
+ "ld1w { z9.s }, p1/Z, [x27, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z10.s }, p1/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z14.s }, p1/Z, [x22, x13, LSL #2]\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z31.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z15.s }, p1/Z, [x21, x13, LSL #2]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z31.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x13, LSL #2]\n"
+ "incw x13\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ ".inst 0xc1bbcb1c // fclamp { z28.s-z31.s }, z24.s, z27.s\n"
+ "st1w { z28.s }, p0, [x12, x10, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x10, LSL #2]\n"
+ "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x10, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
- "ldr x21, [x16, #0x40]\n"
- "incw x27\n"
+ "movprfx z28, z23\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x25, [x16, #0x40]\n"
+ "incw x10\n"
+ "ldr x22, [x16, #0x48]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "ldr x21, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
"fmla z28.s, p3/M, z0.s, z10.s\n"
"fmla z29.s, p3/M, z1.s, z12.s\n"
- "ldr x20, [x16, #0x48]\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x26, [x16, #0x68]\n"
+ "ldr x23, [x16, #0x88]\n"
"fmla z28.s, p3/M, z1.s, z11.s\n"
"fmla z29.s, p3/M, z2.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x20, [x16, #0x50]\n"
+ "ld1w { z18.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "ld1w { z20.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x25, [x16, #0x90]\n"
"fmla z28.s, p3/M, z3.s, z14.s\n"
"fmla z29.s, p3/M, z0.s, z16.s\n"
- "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x16, #0x58]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z23.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z28.s, p3/M, z4.s, z15.s\n"
- "fmla z29.s, p3/M, z4.s, z17.s\n"
- "ldr x21, [x16, #0x78]\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z30.s, p3/M, z0.s, z22.s\n"
"fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z5.s, z18.s\n"
- "ldr x20, [x16, #0x60]\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
- "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "ldr x20, [x16, #0x80]\n"
- "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x20, x15, LSL #2]\n"
"fmla z28.s, p3/M, z5.s, z20.s\n"
"fmla z29.s, p3/M, z3.s, z20.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x21, [x16, #0x68]\n"
- "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
"fmla z31.s, p3/M, z4.s, z16.s\n"
- "ldr x20, [x16, #0x88]\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z0.s, z18.s\n"
- "fmla z31.s, p3/M, z1.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x16, #0x70]\n"
- "ldr x20, [x16, #0x98]\n"
- "fmla z30.s, p3/M, z4.s, z17.s\n"
- "fmla z31.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x16, #0x90]\n"
- "fmla z30.s, p3/M, z1.s, z16.s\n"
- "ldr x20, [x16, #0xa8]\n"
- "fmla z31.s, p3/M, z2.s, z18.s\n"
- "fmla z28.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
"fmla z30.s, p3/M, z6.s, z16.s\n"
- "fmla z31.s, p3/M, z3.s, z17.s\n"
- "ldr x20, [x16, #0xb0]\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z16.s\n"
- "fmla z29.s, p3/M, z7.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z31.s, p3/M, z7.s, z16.s\n"
- "fmla z30.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x16, #0xc0]\n"
- "fmla z31.s, p3/M, z6.s, z17.s\n"
- "fmla z29.s, p3/M, z8.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z21.s\n"
+ "fmla z31.s, p3/M, z5.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z20.s\n"
+ "fmla z31.s, p3/M, z2.s, z21.s\n"
+ "fmla z30.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z18.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z31.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z19.s\n"
"fmla z31.s, p3/M, z8.s, z16.s\n"
- "mov p0.b, p2.b\n"
- ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
- "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
- "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
- "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
- "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xc1bbcb1c // fclamp { z28.s-z31.s }, z24.s, z27.s\n"
+ "st1w { z28.s }, p0, [x12, x10, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x10, LSL #2]\n"
+ "st1w { z30.s }, p0, [x9, x10, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x10, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
index a385893146..f3906d8798 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,65 +72,65 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x7\n"
"ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z20.s, #0x0\n"
+ "fmov z16.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
"ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x14, #0x1\n"
- "orr x24, x20, %x[ld_in_col], LSL #18\n"
- "mov z21.d, z20.d\n"
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
- "orr x24, x16, x24, LSL #20\n"
- "mov x22, #0x6\n"
+ "mov x23, #0x6\n"
+ "add x20, x17, x7\n"
+ "mov z17.d, z16.d\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
"ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z10.s }, p2/Z, [x23, #2, MUL VL]\n"
- "addvl x23, x23, #3\n"
- "add x21, x17, x7\n"
- ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "mov z22.d, z20.d\n"
- "mov z23.d, z20.d\n"
- "ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
- "addvl x23, x23, #3\n"
"mov x8, #0x0\n"
+ "sub x23, x23, x20\n"
+ "sub x20, x14, #0x1\n"
"ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
- "lsl x24, x24, #0x2\n"
- "sub x22, x22, x21\n"
- "ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
- "madd x20, x20, x17, x13\n"
+ ".inst 0xa0404ace // ld1w { z14.s-z15.s }, pn10.b/Z, [x22]\n"
+ "orr x20, x20, %x[ld_in_col], LSL #18\n"
+ "ld1w { z11.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "orr x20, x16, x20, LSL #20\n"
+ ".inst 0xa0404acc // ld1w { z12.s-z13.s }, pn10.b/Z, [x22]\n"
+ "lsl x20, x20, #0x2\n"
+ "madd x21, x21, x17, x13\n"
+ "ld1w { z0.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ ".inst 0xa0404ac4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x22]\n"
+ "ld1w { z7.s }, p2/Z, [x22, #2, MUL VL]\n"
"3:" // Issue prefetches
- "subs x22, x22, #0x1\n"
- ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x20, x13\n"
- ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
"mov x10, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x13, x17, x21, x13\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"ldp x9, x28, [x22], #0x10\n"
- ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
"ldp x25, x24, [x22], #0x10\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
@@ -139,18 +139,18 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"sub x11, x11, x21\n"
- ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ ".inst 0xc1a9c87c // fclamp { z28.s-z31.s }, z3.s, z9.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z12.s }, p1, [x9]\n"
+ "st1w { z28.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z13.s }, p1, [x28]\n"
+ "st1w { z29.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z14.s }, p1, [x25]\n"
+ "st1w { z30.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -162,94 +162,94 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x13]\n"
+ "ld1w { z22.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13619c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z6.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
- ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
"7:" // Unpadded: 1 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z13.s }, p1/Z, [x13]\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
- ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13819c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z8.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- ".inst 0xc13019c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z0.s\n"
- ".inst 0xc13519e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z5.s\n"
- ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc13e1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc13d1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z13.s\n"
+ ".inst 0xc13c1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z12.s\n"
+ ".inst 0xc1351b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z5.s\n"
+ ".inst 0xc1341b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z4.s\n"
"8:" // Unpadded: 0 priming loads
"cbz x14, 16f\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
"sub x14, x14, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x11, x11, #0x1\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"cmp x14, x11\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"csel x21, x14, x11, LT\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"sub x11, x11, x21\n"
"cbz x21, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc13b1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z11.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
- ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
- ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
+ ".inst 0xc13f1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
- ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
- ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13d1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xc13c1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
- "st1w { z12.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
- "st1w { z13.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ ".inst 0xc1351ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z14.s }, p1, [x25]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1a9c87c // fclamp { z28.s-z31.s }, z3.s, z9.s\n"
+ "st1w { z28.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z29.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z30.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z15.s }, p1, [x24]\n"
+ "st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
@@ -259,173 +259,173 @@ void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ "mov x12, #0x4\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301980 // fmla za.s[x8, 0], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1341b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z4.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ "mov x12, #0x4\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z15.s\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
- ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
- ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z14.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xc13d1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z13.s\n"
+ ".inst 0xc13c1b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z12.s\n"
+ ".inst 0xc1351b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z5.s\n"
+ ".inst 0xc1341b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z4.s\n"
"13:" // Padded: 0 priming loads
"cbz x14, 16f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z25.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
- "sub x14, x14, #0x1\n"
- "sub x11, x11, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "cmp x14, x11\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
- "csel x21, x14, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "sub x11, x11, x21\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc13b1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z11.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z15.s\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
- ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
- "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0xc13e1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
- ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
- ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xc13c1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
- "st1w { z16.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
- "st1w { z17.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ ".inst 0xc1351ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z18.s }, p1, [x25]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
- "st1w { z19.s }, p1, [x24]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c87c // fclamp { z28.s-z31.s }, z3.s, z9.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "st1w { z28.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z29.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z30.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
+ "st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
- ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
- ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
- ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
- ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
- ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
- ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
- "st1w { z16.s }, p1, [x9]\n"
+ ".inst 0xc13b1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc13f1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13d1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xc13c1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z12.s\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ ".inst 0xc1351ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a9c874 // fclamp { z20.s-z23.s }, z3.s, z9.s\n"
+ "st1w { z20.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
- "st1w { z17.s }, p1, [x28]\n"
+ "st1w { z21.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
- "add x8, x8, #0x1\n"
- "st1w { z18.s }, p1, [x25]\n"
+ "st1w { z22.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z19.s }, p1, [x24]\n"
+ "st1w { z23.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"16:" // Main loop skip tail
"cbz x11, 18f\n"
"17:" // Right padding loop
- ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x11, x11, #0x1\n"
- ".inst 0xc1b8c848 // fclamp { z8.s-z11.s }, z2.s, z24.s\n"
- "st1w { z8.s }, p1, [x9]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a9c864 // fclamp { z4.s-z7.s }, z3.s, z9.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
- "st1w { z9.s }, p1, [x28]\n"
+ "st1w { z5.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z10.s }, p1, [x25]\n"
+ "st1w { z6.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z11.s }, p1, [x24]\n"
+ "st1w { z7.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
index 26315101b4..5ecfb08799 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,18 +72,18 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
"ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x7\n"
"ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z7.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
"fmov z12.s, #0x0\n"
@@ -91,46 +91,46 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
"ld1w { z12.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
"ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x14, #0x1\n"
- "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov x23, #0x9\n"
+ "add x20, x17, x7\n"
"mov z13.d, z12.d\n"
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
- "orr x24, x16, x24, LSL #20\n"
- "mov x22, #0x9\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
- "addvl x23, x23, #3\n"
- "add x21, x17, x7\n"
- ".inst 0xa0404ae0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x23]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
"mov z14.d, z12.d\n"
"mov z15.d, z12.d\n"
- "ld1w { z5.s }, p2/Z, [x23, #2, MUL VL]\n"
- "addvl x23, x23, #3\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
"mov x8, #0x0\n"
+ "sub x23, x23, x20\n"
+ "sub x20, x14, #0x1\n"
"ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0xa1404ae3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x23]\n"
- "lsl x24, x24, #0x2\n"
- "sub x22, x22, x21\n"
- "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
- "madd x20, x20, x17, x13\n"
+ ".inst 0xa0404ac6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x22]\n"
+ "orr x20, x20, %x[ld_in_col], LSL #18\n"
+ "ld1w { z10.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "orr x20, x16, x20, LSL #20\n"
+ ".inst 0xa0404ac2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x22]\n"
+ "lsl x20, x20, #0x2\n"
+ "madd x21, x21, x17, x13\n"
+ "ld1w { z11.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ ".inst 0xa0404ac4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x22]\n"
+ "ld1w { z9.s }, p2/Z, [x22, #2, MUL VL]\n"
"3:" // Issue prefetches
- "subs x22, x22, #0x1\n"
- ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x20, x13\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "mov x22, #0x2\n"
"ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x13, x17, x21, x13\n"
".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
- "mov x22, #0x2\n"
- "ldp x10, x9, [x23], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
"ldp x26, x25, [x23], #0x10\n"
"ldp x24, x23, [x20], #0x10\n"
"cbz x21, 5f\n"
@@ -142,9 +142,9 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
"lsr x21, x21, #0x1\n"
"sub x11, x11, x21\n"
+ ".inst 0xc1a8cb94 // fclamp { z20.s-z23.s }, z28.s, z8.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
"st1w { z20.s }, p1, [x10]\n"
@@ -165,136 +165,136 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x13]\n"
+ "ld1w { z21.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1321a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z2.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
- ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ ".inst 0xc1321bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z2.s\n"
+ ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
"7:" // Unpadded: 1 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x13]\n"
+ "ld1w { z21.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z10.s\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z1.s\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
- ".inst 0xc13b1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z11.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1331ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z3.s\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
"8:" // Unpadded: 0 priming loads
"cmp x14, #0x2\n"
"blt 16f\n"
"add x21, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
"sub x14, x14, #0x2\n"
- "ld1w { z19.s }, p1/Z, [x21]\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x11, x11, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"lsr x20, x14, #0x1\n"
- "ld1w { z20.s }, p1/Z, [x21]\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"cmp x20, x11\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
+ "ld1w { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"csel x22, x20, x11, LT\n"
- "ld1w { z21.s }, p1/Z, [x21]\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"and x14, x14, #0x1\n"
- "ld1w { z22.s }, p1/Z, [x21]\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x11, x11, x22\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ "ld1w { z1.s }, p1/Z, [x21]\n"
"cbz x22, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ ".inst 0xc13a1ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z10.s\n"
"add x21, x13, %x[ld_in_row], LSL #2\n"
"subs x22, x22, #0x1\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
+ ".inst 0xc1361ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z6.s\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
- ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
- "ld1w { z18.s }, p1/Z, [x21]\n"
+ ".inst 0xc13b1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z11.s\n"
+ ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
- ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ ".inst 0xc1391bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z9.s\n"
+ ".inst 0xc1341bc1 // fmla za.s[x8, 1], { z30.s-z1.s }, z4.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "ld1w { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
- "st1w { z20.s }, p1, [x10]\n"
- "ld1w { z20.s }, p1/Z, [x21]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x21]\n"
+ ".inst 0xc1371ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z7.s\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
+ ".inst 0xc1a8cb98 // fclamp { z24.s-z27.s }, z28.s, z8.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z3.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z24.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z21.s }, p1, [x9]\n"
- "ld1w { z28.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "st1w { z25.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
- "st1w { z22.s }, p1, [x26]\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
- ".inst 0xc13b1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z11.s\n"
+ "st1w { z26.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z23.s }, p1, [x25]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
+ "st1w { z27.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z5.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
@@ -304,323 +304,323 @@ void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"ld1w { z24.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
"ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1331b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z3.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z22.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1371ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z7.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13b1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z11.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1351bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z5.s\n"
"13:" // Padded: 0 priming loads
"cmp x14, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z25.s }, p0/Z, [x13]\n"
"add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z29.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
- "mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x14, x14, #0x2\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z28.s }, p0/Z, [x21]\n"
- "sub x11, x11, #0x1\n"
- "lsr x20, x14, #0x1\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
- "mov x12, #0x8\n"
- "cmp x20, x11\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
- "csel x22, x20, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "and x14, x14, #0x1\n"
- "sub x11, x11, x22\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
"cbz x22, 15f\n"
"14:" // Padded: Main loop
- ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
"mov x12, #0x0\n"
+ ".inst 0xc13a1ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z10.s\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ ".inst 0xc1361ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z6.s\n"
+ "subs x22, x22, #0x1\n"
"ld1w { z18.s }, p0/Z, [x13]\n"
- "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc13b1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z11.s\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z9.s\n"
+ ".inst 0xc1341bc1 // fmla za.s[x8, 1], { z30.s-z1.s }, z4.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0xc1a8cb98 // fclamp { z24.s-z27.s }, z28.s, z8.s\n"
+ "ld1w { z31.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
- "add x8, x8, #0x1\n"
- "st1w { z29.s }, p1, [x9]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z24.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z25.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z28.s }, p0/Z, [x21]\n"
- "st1w { z30.s }, p1, [x26]\n"
- "mov x12, #0x8\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
+ "st1w { z26.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z27.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z7.s\n"
"mov x12, #0x0\n"
"ld1w { z22.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0xc1331ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z3.s\n"
+ "ld1w { z29.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
- "add x10, x10, x28, LSL #2\n"
- "add x9, x9, x27, LSL #2\n"
- "add x26, x26, x24, LSL #2\n"
- "add x25, x25, x23, LSL #2\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
- "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc13a1ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z10.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361ba1 // fmla za.s[x8, 1], { z29.s-z0.s }, z6.s\n"
+ "ld1w { z29.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc13b1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z11.s\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ ".inst 0xc1321ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z2.s\n"
"ld1w { z22.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
- ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z9.s\n"
+ ".inst 0xc1341bc1 // fmla za.s[x8, 1], { z30.s-z1.s }, z4.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"mov x12, #0x4\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
"ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0xc1a8cb90 // fclamp { z16.s-z19.s }, z28.s, z8.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x8, x8, #0x1\n"
- "st1w { z29.s }, p1, [x9]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "st1w { z30.s }, p1, [x26]\n"
- "mov x12, #0x8\n"
- ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
+ "st1w { z16.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
+ "st1w { z17.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc1371ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z7.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z3.s\n"
+ ".inst 0xc1351bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z5.s\n"
"16:" // Main loop skip tail
"cbz x14, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z23.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z10.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z5.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
- "sub x11, x11, #0x1\n"
- ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
- ".inst 0xc1301b81 // fmla za.s[x8, 1], { z28.s-z31.s }, z0.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc13b1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1321a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z2.s\n"
+ ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
+ ".inst 0xc1341b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z4.s\n"
".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a9c8f0 // fclamp { z16.s-z19.s }, z7.s, z9.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ ".inst 0xc1a8cb90 // fclamp { z16.s-z19.s }, z28.s, z8.s\n"
"st1w { z16.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1331ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z3.s\n"
- "add x8, x8, #0x1\n"
"st1w { z17.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"st1w { z18.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
"st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
"cbz x11, 19f\n"
"18:" // Right padding loop
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x11, x11, #0x1\n"
- ".inst 0xc1a9c8e0 // fclamp { z0.s-z3.s }, z7.s, z9.s\n"
- "st1w { z0.s }, p1, [x10]\n"
- "add x10, x10, x28, LSL #2\n"
".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
- "st1w { z1.s }, p1, [x9]\n"
+ ".inst 0xc1a8cb94 // fclamp { z20.s-z23.s }, z28.s, z8.s\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z2.s }, p1, [x26]\n"
+ "st1w { z22.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- "st1w { z3.s }, p1, [x25]\n"
+ "st1w { z23.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"bgt 18b\n"
"19:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
index 3741b973b4..d59a2e5c6a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,67 +72,67 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x20, x20, x6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p1.s, XZR, x17\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x7\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z28.s, #0x0\n"
+ "fmov z24.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #18\n"
- "mov z29.d, z28.d\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
- "orr x23, x17, x23, LSL #20\n"
"mov x22, #0x8\n"
+ "add x20, x7, x6\n"
+ "mov z25.d, z24.d\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
"ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
- "addvl x14, x14, #5\n"
- "mov z30.d, z28.d\n"
- "mov z31.d, z28.d\n"
- ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
"mov x8, #0x0\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
"ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x23, x23, #0x2\n"
+ "orr x20, x20, %x[ld_in_col], LSL #18\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "orr x20, x17, x20, LSL #20\n"
+ "madd x21, x21, x7, x13\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "lsl x20, x20, #0x2\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x13\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
"addvl x14, x14, #5\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x13, x7, x20, x13\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
"mov x10, #0x4\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x13, x7, x21, x13\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ldp x9, x28, [x22], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc0040f03 // mova za.d[x8, #3], { z24.d-z27.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
"ldp x25, x24, [x22], #0x10\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
@@ -141,18 +141,18 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"sub x11, x11, x21\n"
- ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ ".inst 0xc1bdcae0 // fclamp { z0.s-z3.s }, z23.s, z29.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z4.s }, p1, [x9]\n"
+ "st1w { z0.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z5.s }, p1, [x28]\n"
+ "st1w { z1.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z6.s }, p1, [x25]\n"
+ "st1w { z2.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z7.s }, p1, [x24]\n"
+ "st1w { z3.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -168,275 +168,275 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x13]\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z14.s\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1351be0 // fmla za.s[x8, 0], { z31.s-z2.s }, z5.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc1351a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z5.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc13a1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z10.s\n"
".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1381820 // fmla za.s[x8, 0], { z1.s-z4.s }, z8.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13c1840 // fmla za.s[x8, 0], { z2.s-z5.s }, z12.s\n"
+ ".inst 0xc1381860 // fmla za.s[x8, 0], { z3.s-z6.s }, z8.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
"7:" // Unpadded: 3 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x13]\n"
+ "ld1w { z15.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s\n"
- ".inst 0xc13e1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z14.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13b1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z11.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z10.s\n"
- ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ ".inst 0xc13d19e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z13.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc13519e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z5.s\n"
".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- ".inst 0xc1351ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z5.s\n"
- ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z13.s\n"
+ ".inst 0xc1351a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z5.s\n"
+ ".inst 0xa14049c7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1331a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z3.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1321a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z2.s\n"
+ ".inst 0xc13f1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z15.s\n"
+ ".inst 0xc1371a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z7.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"addvl x14, x14, #5\n"
"8:" // Unpadded: 2 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z1.s }, p1/Z, [x13]\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z2.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z3.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z4.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13c1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z12.s\n"
- ".inst 0xc13f1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z15.s\n"
- "ld1w { z5.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1822 // fmla za.s[x8, 2], { z1.s-z4.s }, z14.s\n"
- "ld1w { z6.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381840 // fmla za.s[x8, 0], { z2.s-z5.s }, z8.s\n"
- ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc13b1841 // fmla za.s[x8, 1], { z2.s-z5.s }, z11.s\n"
- ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc13a1842 // fmla za.s[x8, 2], { z2.s-z5.s }, z10.s\n"
- "ld1w { z7.s }, p1/Z, [x20]\n"
+ ".inst 0xc13419c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d19c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z13.s\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z14.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc13d1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z13.s\n"
- ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13519c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13c1862 // fmla za.s[x8, 2], { z3.s-z6.s }, z12.s\n"
- "ld1w { z8.s }, p1/Z, [x20]\n"
- ".inst 0xc1301880 // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s\n"
- ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
+ ".inst 0xa14149c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z11.s\n"
+ ".inst 0xc13a19e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z10.s\n"
+ ".inst 0xa14049c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1321a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z2.s\n"
+ ".inst 0xa04149ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1391a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc1311a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa14049c7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13e1882 // fmla za.s[x8, 2], { z4.s-z7.s }, z14.s\n"
- ".inst 0xc13c18a0 // fmla za.s[x8, 0], { z5.s-z8.s }, z12.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13118a1 // fmla za.s[x8, 1], { z5.s-z8.s }, z1.s\n"
- ".inst 0xc13018a2 // fmla za.s[x8, 2], { z5.s-z8.s }, z0.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1351a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z5.s\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc1371a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z7.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"addvl x14, x14, #5\n"
"9:" // Unpadded: 1 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x13]\n"
+ "ld1w { z15.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
- ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c19e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z12.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13b1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z11.s\n"
- ".inst 0xa14149c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13a1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z10.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d19e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z13.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ ".inst 0xc13519e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xa14049c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc1381a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z8.s\n"
+ ".inst 0xa14149c7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc13a1a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1351ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z5.s\n"
- ".inst 0xc1341ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1331a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z3.s\n"
+ ".inst 0xc1321a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z2.s\n"
+ ".inst 0xa14149c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
+ ".inst 0xc1341a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z4.s\n"
+ ".inst 0xa14049c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
- "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
- ".inst 0xc1391ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z9.s\n"
- ".inst 0xc1311ae3 // fmla za.s[x8, 3], { z23.s-z26.s }, z1.s\n"
- ".inst 0xc13d1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z13.s\n"
- ".inst 0xc13c1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z12.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1351b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z5.s\n"
- ".inst 0xc1341b03 // fmla za.s[x8, 3], { z24.s-z27.s }, z4.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc1371a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z7.s\n"
+ ".inst 0xc1311a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z1.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z0.s\n"
+ ".inst 0xc13d1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z13.s\n"
+ ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xc1361a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z6.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
"addvl x14, x14, #5\n"
"10:" // Unpadded: 0 priming loads
"cbz x15, 20f\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x13]\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
"sub x15, x15, #0x1\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x11, x11, #0x1\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"cmp x15, x11\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"csel x21, x15, x11, LT\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x11, x11, x21\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"cbz x21, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13719c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z7.s\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
- ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
- ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
- ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13c19c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z12.s\n"
+ ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d19c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z13.s\n"
+ ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xc13519c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc13919e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z9.s\n"
"ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
- "ld1w { z18.s }, p1/Z, [x13]\n"
+ ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
+ ".inst 0xa14149c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1301a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z0.s\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
- ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
- ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13b19e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z11.s\n"
+ ".inst 0xc13a19e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z10.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
- "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc13d1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z13.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z8.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1361a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13c1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z12.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
- ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1321a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z2.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z15.s\n"
- "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc13e1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z14.s\n"
- ".inst 0xc1381aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z8.s\n"
- ".inst 0xc1301aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z0.s\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ ".inst 0xc13d1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z13.s\n"
+ ".inst 0xc1351a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z5.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1311a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z1.s\n"
+ ".inst 0xc1301a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z0.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
- ".inst 0xc1b1ca0c // fclamp { z12.s-z15.s }, z16.s, z17.s\n"
- "st1w { z12.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc1371ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z7.s\n"
- "st1w { z13.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- ".inst 0xc1361ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z6.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "st1w { z14.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xc1351ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z5.s\n"
- "st1w { z15.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
- ".inst 0xc1341ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z4.s\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xc1341a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z4.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z11.s\n"
+ ".inst 0xc13a1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z10.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x8, x8, #0x1\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1bdcae0 // fclamp { z0.s-z3.s }, z23.s, z29.s\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "st1w { z0.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
"addvl x14, x14, #5\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "st1w { z1.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z2.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "st1w { z3.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -450,395 +450,395 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z19.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z14.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc13519c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z5.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x14, x14, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc13a19e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z10.s\n"
+ ".inst 0xa14049c7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z0.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z1.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z2.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z3.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s\n"
+ "mov x12, #0x4\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13e1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z14.s\n"
- "ld1w { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d19e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z13.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13b1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z11.s\n"
- "ld1w { z5.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z10.s\n"
- ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13519e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xa14049c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc1391840 // fmla za.s[x8, 0], { z2.s-z5.s }, z9.s\n"
- "ld1w { z6.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1381841 // fmla za.s[x8, 1], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xc13b1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13f1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z15.s\n"
- "ld1w { z7.s }, p0/Z, [x20]\n"
- ".inst 0xc13e1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z14.s\n"
- ".inst 0xa14049c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z12.s\n"
+ ".inst 0xc1341a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z4.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13b1880 // fmla za.s[x8, 0], { z4.s-z7.s }, z11.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1331881 // fmla za.s[x8, 1], { z4.s-z7.s }, z3.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z13.s\n"
+ ".inst 0xc13c1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"addvl x14, x14, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z19.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
- ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13f1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z15.s\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xc13419e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z4.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s\n"
- ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13d19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z13.s\n"
+ ".inst 0xa04149c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13519e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13b1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z11.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z8.s\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc13a1a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z10.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13a1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa14049c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
- ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
- ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
- ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
- ".inst 0xc1321ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z2.s\n"
- ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
- ".inst 0xc1361ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z6.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc13a1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z10.s\n"
+ ".inst 0xc1321a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z2.s\n"
+ ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1311a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z1.s\n"
+ ".inst 0xc1301a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z0.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
"addvl x14, x14, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z14.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13d1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z13.s\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13c19c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z12.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z14.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13419c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d19c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z13.s\n"
+ ".inst 0xc13519c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z5.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1391a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z9.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1381a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z8.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13919e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z9.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13819e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z8.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b19e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z11.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13b1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z11.s\n"
- ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13a1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z10.s\n"
- ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a19e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z10.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
- ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
- ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
+ ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1301a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
- "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1301aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z0.s\n"
- ".inst 0xc1391aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z9.s\n"
- ".inst 0xc1311aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z1.s\n"
- ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
- ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13b1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z11.s\n"
- ".inst 0xc13a1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z10.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ ".inst 0xc1361a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z6.s\n"
+ ".inst 0xc13f1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z15.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13e1a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z14.s\n"
+ ".inst 0xc13c1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xc1341a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z4.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z3.s\n"
+ ".inst 0xc1321a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z2.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
"addvl x14, x14, #5\n"
"17:" // Padded: 0 priming loads
"cbz x15, 20f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x13]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x15, x15, #0x1\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x15, x11\n"
+ "ld1w { z14.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x15, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "sub x15, x15, #0x1\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "sub x11, x11, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "cmp x15, x11\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "csel x21, x15, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "sub x11, x11, x21\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
"cbz x21, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13719c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z7.s\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c19c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z12.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
- ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
- ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
- ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
- ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1301a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z0.s\n"
- "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
- "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d19c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z13.s\n"
+ ".inst 0xc13519c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z5.s\n"
+ "ld1w { z14.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
- ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
- ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
- ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1331a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z3.s\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13919e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b19e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z11.s\n"
+ ".inst 0xc13a19e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z10.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1321a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z2.s\n"
- ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
- ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa04149ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1311a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1301a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z0.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- ".inst 0xc13c1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z12.s\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "mov x12, #0x4\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
"ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
- "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
- ".inst 0xc1371aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z7.s\n"
- ".inst 0xc1361aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z6.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z3.s\n"
+ ".inst 0xc1321a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z2.s\n"
+ ".inst 0xc13d1a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z13.s\n"
+ ".inst 0xc1351a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z5.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
- "st1w { z4.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc13b1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z11.s\n"
- "st1w { z5.s }, p1, [x28]\n"
- "add x28, x28, x26, LSL #2\n"
- ".inst 0xc1331ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z3.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "st1w { z6.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0xc13f1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z15.s\n"
- "st1w { z7.s }, p1, [x24]\n"
- "add x24, x24, x22, LSL #2\n"
- ".inst 0xc13e1ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z14.s\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z12.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13b1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z11.s\n"
+ ".inst 0xc13a1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z10.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1381a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc1301a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z0.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
"addvl x14, x14, #5\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1bdcae0 // fclamp { z0.s-z3.s }, z23.s, z29.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z0.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "st1w { z1.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z2.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z3.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
- "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
- ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
- ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
- ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
- ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
- ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
- "addvl x14, x14, #5\n"
- ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc13719c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z7.s\n"
"ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
- ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
- ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
- ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
- ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13c19c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z12.s\n"
+ ".inst 0xc13419c2 // fmla za.s[x8, 2], { z14.s-z17.s }, z4.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13d19c3 // fmla za.s[x8, 3], { z14.s-z17.s }, z13.s\n"
+ ".inst 0xc13619e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xc13519c4 // fmla za.s[x8, 4], { z14.s-z17.s }, z5.s\n"
+ ".inst 0xa14049c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13919e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z9.s\n"
+ ".inst 0xc13819e2 // fmla za.s[x8, 2], { z15.s-z18.s }, z8.s\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1371a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc13b19e3 // fmla za.s[x8, 3], { z15.s-z18.s }, z11.s\n"
+ ".inst 0xc13a19e4 // fmla za.s[x8, 4], { z15.s-z18.s }, z10.s\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
"addvl x14, x14, #5\n"
- ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
"ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
- ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
- ".inst 0xc1341a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z4.s\n"
- ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
- ".inst 0xc1331a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z3.s\n"
- ".inst 0xc1321a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xc1331a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s\n"
+ ".inst 0xc1301a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z0.s\n"
+ ".inst 0xc1321a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z2.s\n"
+ ".inst 0xa14149c7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13e1a03 // fmla za.s[x8, 3], { z16.s-z19.s }, z14.s\n"
+ ".inst 0xc1361a04 // fmla za.s[x8, 4], { z16.s-z19.s }, z6.s\n"
".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
- ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
- ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
- ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
- ".inst 0xc13b1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z11.s\n"
- ".inst 0xc13a1aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z10.s\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
- "st1w { z4.s }, p1, [x9]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xc1391a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z9.s\n"
+ ".inst 0xc1381a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z8.s\n"
+ ".inst 0xc13d1a23 // fmla za.s[x8, 3], { z17.s-z20.s }, z13.s\n"
+ ".inst 0xc13c1a24 // fmla za.s[x8, 4], { z17.s-z20.s }, z12.s\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc1371a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z7.s\n"
+ ".inst 0xc1331a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z3.s\n"
+ ".inst 0xc1321a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z2.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1bdcae8 // fclamp { z8.s-z11.s }, z23.s, z29.s\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ "st1w { z8.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13d1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z13.s\n"
- "st1w { z5.s }, p1, [x28]\n"
+ "st1w { z9.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc13c1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z12.s\n"
- "st1w { z6.s }, p1, [x25]\n"
+ "st1w { z10.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc1331ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z3.s\n"
- "st1w { z7.s }, p1, [x24]\n"
+ "st1w { z11.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc1321ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z2.s\n"
- "add x8, x8, #0x1\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"20:" // Main loop skip tail
"cbz x11, 22f\n"
"21:" // Right padding loop
".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x11, x11, #0x1\n"
- ".inst 0xc1b1ca00 // fclamp { z0.s-z3.s }, z16.s, z17.s\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ ".inst 0xc1bdcae0 // fclamp { z0.s-z3.s }, z23.s, z29.s\n"
"st1w { z0.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"st1w { z1.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"st1w { z2.s }, p1, [x25]\n"
@@ -848,12 +848,12 @@ void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
"bgt 21b\n"
"22:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x16\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
index 81ad8e5833..233b6bd61a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,67 +72,67 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0xb\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x5\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x20, x20, x5\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x6\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z28.s, #0x0\n"
+ "fmov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
"ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x16, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #18\n"
- "mov z29.d, z28.d\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "orr x23, x7, x23, LSL #20\n"
"mov x22, #0xb\n"
+ "add x20, x6, x5\n"
+ "mov z21.d, z20.d\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
"ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "add x21, x6, x5\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "mov z30.d, z28.d\n"
- "mov z31.d, z28.d\n"
- ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"mov x8, #0x0\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x16, #0x1\n"
"ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x23, x23, #0x2\n"
+ "orr x20, x20, %x[ld_in_col], LSL #18\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "orr x20, x7, x20, LSL #20\n"
+ "madd x21, x21, x6, x14\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "lsl x20, x20, #0x2\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x6, x14\n"
- "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"3:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x14, x6, x20, x14\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"mov x22, #0x4\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x6, x21, x14\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc0040e83 // mova za.d[x8, #3], { z20.d-z23.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
"ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 5f\n"
@@ -141,21 +141,21 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1a3c850 // fclamp { z16.s-z19.s }, z2.s, z3.s\n"
"lsr x21, x21, #0x1\n"
"sub x13, x13, x21\n"
+ ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z16.s }, p1, [x11]\n"
+ "st1w { z24.s }, p1, [x11]\n"
"add x11, x11, x9, LSL #2\n"
- "st1w { z17.s }, p1, [x10]\n"
+ "st1w { z25.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z18.s }, p1, [x27]\n"
+ "st1w { z26.s }, p1, [x27]\n"
"add x27, x27, x25, LSL #2\n"
- "st1w { z19.s }, p1, [x26]\n"
+ "st1w { z27.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -171,331 +171,331 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z9.s }, p1/Z, [x14]\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z7.s\n"
- "ld1w { z13.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
+ ".inst 0xa04049e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1341940 // fmla za.s[x8, 0], { z10.s-z13.s }, z4.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ ".inst 0xc1321b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z2.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1311940 // fmla za.s[x8, 0], { z10.s-z13.s }, z1.s\n"
+ ".inst 0xc1341b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z4.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
"7:" // Unpadded: 3 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x14]\n"
+ "ld1w { z28.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
"ld1w { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
"ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z15.s\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z15.s\n"
+ ".inst 0xa04049e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
"ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z12.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
- ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1331ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z3.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13b1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z11.s\n"
- ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13e1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z14.s\n"
+ ".inst 0xc1351bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z5.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"8:" // Unpadded: 2 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x14]\n"
+ "ld1w { z25.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
- ".inst 0xc1341a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z4.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13019c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z0.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13719c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z7.s\n"
- ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1361b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
- ".inst 0xc1381a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301940 // fmla za.s[x8, 0], { z10.s-z13.s }, z0.s\n"
".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341941 // fmla za.s[x8, 1], { z10.s-z13.s }, z4.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
- ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13e1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z14.s\n"
- ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1321b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z2.s\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1361b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z6.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381960 // fmla za.s[x8, 0], { z11.s-z14.s }, z8.s\n"
+ ".inst 0xc1311961 // fmla za.s[x8, 1], { z11.s-z14.s }, z1.s\n"
+ ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z4.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"9:" // Unpadded: 1 priming loads
"add x20, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z7.s }, p1/Z, [x14]\n"
+ "ld1w { z24.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z8.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z9.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z10.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13b18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z11.s\n"
- ".inst 0xc13518e1 // fmla za.s[x8, 1], { z7.s-z10.s }, z5.s\n"
- "ld1w { z21.s }, p1/Z, [x20]\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
- "ld1w { z11.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
- ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13e1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z14.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xa14049e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13d1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s\n"
- ".inst 0xc1311901 // fmla za.s[x8, 1], { z8.s-z11.s }, z1.s\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13e1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z14.s\n"
- ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc1311860 // fmla za.s[x8, 0], { z3.s-z6.s }, z1.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13c1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z12.s\n"
".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc13f1921 // fmla za.s[x8, 1], { z9.s-z12.s }, z15.s\n"
- ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
- ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
+ ".inst 0xc1391b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z9.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13a1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311880 // fmla za.s[x8, 0], { z4.s-z7.s }, z1.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13d1b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z13.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"10:" // Unpadded: 0 priming loads
"cmp x16, #0x2\n"
"blt 20f\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x14]\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
"sub x16, x16, #0x2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x13, x13, #0x1\n"
- "ld1w { z23.s }, p1/Z, [x21]\n"
+ "ld1w { z10.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"lsr x20, x16, #0x1\n"
- "ld1w { z17.s }, p1/Z, [x21]\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"cmp x20, x13\n"
- "ld1w { z24.s }, p1/Z, [x21]\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"csel x23, x20, x13, LT\n"
- "ld1w { z18.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"and x16, x16, #0x1\n"
- "ld1w { z19.s }, p1/Z, [x21]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x13, x13, x23\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z20.s }, p1/Z, [x21]\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
"cbz x23, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z13.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
"add x22, x14, %x[ld_in_row], LSL #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
- ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
- ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
- "ld1w { z11.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
- ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1361921 // fmla za.s[x8, 1], { z9.s-z12.s }, z6.s\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371922 // fmla za.s[x8, 2], { z9.s-z12.s }, z7.s\n"
".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
- "ld1w { z4.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1311ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z1.s\n"
- ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1381ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z8.s\n"
- ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
- "ld1w { z15.s }, p1/Z, [x14]\n"
+ "ld1w { z31.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "add x20, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
- "ld1w { z22.s }, p1/Z, [x22]\n"
+ ".inst 0xc1351b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z5.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z4.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0xc1381940 // fmla za.s[x8, 0], { z10.s-z13.s }, z8.s\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1321941 // fmla za.s[x8, 1], { z10.s-z13.s }, z2.s\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1361942 // fmla za.s[x8, 2], { z10.s-z13.s }, z6.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x22]\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1371b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z7.s\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
- ".inst 0xc1301b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z0.s\n"
- ".inst 0xa0414aa6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc13c1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z12.s\n"
- "ld1w { z17.s }, p1/Z, [x22]\n"
+ ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
+ ".inst 0xc1321961 // fmla za.s[x8, 1], { z11.s-z14.s }, z2.s\n"
+ ".inst 0xa0414a82 // ld1w { z2.s-z3.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1381962 // fmla za.s[x8, 2], { z11.s-z14.s }, z8.s\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x22]\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xa1404a82 // ld1w { z2.s, z10.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xa1404aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21]\n"
- "addvl x21, x21, #5\n"
- ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
- "st1w { z8.s }, p1, [x11]\n"
- "ld1w { z18.s }, p1/Z, [x22]\n"
+ "addvl x20, x20, #5\n"
+ "ld1w { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0414a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1331be0 // fmla za.s[x8, 0], { z31.s-z2.s }, z3.s\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1b0ca2c // fclamp { z12.s-z15.s }, z17.s, z16.s\n"
+ ".inst 0xc13a1be1 // fmla za.s[x8, 1], { z31.s-z2.s }, z10.s\n"
+ "ld1w { z3.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0414a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1371b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z7.s\n"
+ "ld1w { z30.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
+ "st1w { z12.s }, p1, [x11]\n"
+ ".inst 0xc1351b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z5.s\n"
+ ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
"add x11, x11, x9, LSL #2\n"
- ".inst 0xc13c19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z12.s\n"
- ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
- "st1w { z9.s }, p1, [x10]\n"
+ "st1w { z13.s }, p1, [x10]\n"
+ ".inst 0xa0414a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xa1414aa6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- "addvl x21, x21, #5\n"
- "st1w { z10.s }, p1, [x27]\n"
+ "st1w { z14.s }, p1, [x27]\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
"add x27, x27, x25, LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
- "st1w { z11.s }, p1, [x26]\n"
- ".inst 0xc13f1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z15.s\n"
- "ld1w { z19.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "st1w { z15.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xa0404aae // ld1w { z14.s-z15.s }, pn10.b/Z, [x21]\n"
- ".inst 0xc13f1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z15.s\n"
- ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- "addvl x21, x21, #5\n"
- ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
- "ld1w { z26.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa0404aac // ld1w { z12.s-z13.s }, pn10.b/Z, [x21]\n"
- ".inst 0xc13d1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z13.s\n"
- ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- "addvl x21, x21, #5\n"
- ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
- "ld1w { z20.s }, p1/Z, [x22]\n"
- ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
- ".inst 0xc13f1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z15.s\n"
- ".inst 0xa0414aaa // ld1w { z10.s-z11.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ ".inst 0xc1391800 // fmla za.s[x8, 0], { z0.s-z3.s }, z9.s\n"
+ ".inst 0xa0414a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13b1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z11.s\n"
+ ".inst 0xa1404a86 // ld1w { z6.s, z14.s }, pn10.b/Z, [x20]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z22.s }, p1/Z, [x14]\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z5.s\n"
+ "ld1w { z10.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z12.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1371820 // fmla za.s[x8, 0], { z1.s-z4.s }, z7.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13e1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z14.s\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -509,625 +509,625 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z9.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1371ba0 // fmla za.s[x8, 0], { z29.s-z0.s }, z7.s\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1371ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z7.s\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1361940 // fmla za.s[x8, 0], { z10.s-z13.s }, z6.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1341900 // fmla za.s[x8, 0], { z8.s-z11.s }, z4.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z6.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ ".inst 0xc13e1be0 // fmla za.s[x8, 0], { z31.s-z2.s }, z14.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z22.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z30.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z10.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13f1bc0 // fmla za.s[x8, 0], { z30.s-z1.s }, z15.s\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13c1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z12.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1be0 // fmla za.s[x8, 0], { z31.s-z2.s }, z14.s\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
- ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13d1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z13.s\n"
+ ".inst 0xc13b1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z11.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1361b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
- ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xa14049e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
- ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc13a1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z10.s\n"
- ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
- ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1301a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z0.s\n"
+ ".inst 0xc1301960 // fmla za.s[x8, 0], { z11.s-z14.s }, z0.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z1.s\n"
+ ".inst 0xa14149e3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1321b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z2.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc1341981 // fmla za.s[x8, 1], { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1331b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z3.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1301b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z0.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z19.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
"ld1w { z8.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z22.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"mov x12, #0x8\n"
- ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13e1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z14.s\n"
"ld1w { z11.s }, p0/Z, [x20]\n"
- ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z15.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13f1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z15.s\n"
- ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13c1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z12.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13e1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z14.s\n"
"ld1w { z12.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
- ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
- ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
- ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z3.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc13f1b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z15.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13d1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z13.s\n"
- ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc13d1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z13.s\n"
+ ".inst 0xc1351921 // fmla za.s[x8, 1], { z9.s-z12.s }, z5.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1331b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z3.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"17:" // Padded: 0 priming loads
"cmp x16, #0x2\n"
"blt 20f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z22.s }, p0/Z, [x14]\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x16, x16, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "lsr x20, x16, #0x1\n"
+ "cmp x20, x13\n"
+ "and x16, x16, #0x1\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "sub x13, x13, x23\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z17.s }, p0/Z, [x21]\n"
- "mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
- "sub x16, x16, #0x2\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "sub x13, x13, #0x1\n"
- "ld1w { z26.s }, p0/Z, [x21]\n"
- "lsr x20, x16, #0x1\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "cmp x20, x13\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
- "csel x23, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col], LSL #2\n"
- "and x16, x16, #0x1\n"
- "sub x13, x13, x23\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
"cbz x23, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
"add x22, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361921 // fmla za.s[x8, 1], { z9.s-z12.s }, z6.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
- ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
- ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
- ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1371922 // fmla za.s[x8, 2], { z9.s-z12.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0xc1351b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z5.s\n"
+ "ld1w { z8.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1301b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1351ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z5.s\n"
- ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
- "ld1w { z0.s }, p2/Z, [x15, #4, MUL VL]\n"
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13c1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z12.s\n"
- "ld1w { z12.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z4.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1321941 // fmla za.s[x8, 1], { z10.s-z13.s }, z2.s\n"
+ ".inst 0xa04149e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1361942 // fmla za.s[x8, 2], { z10.s-z13.s }, z6.s\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z13.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ "mov x12, #0x4\n"
+ ".inst 0xa14049e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z15.s\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z0.s\n"
+ ".inst 0xc1311b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z1.s\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z10.s\n"
- ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc13e1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z14.s\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
+ ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xa0414a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1321962 // fmla za.s[x8, 2], { z11.s-z14.s }, z2.s\n"
+ "ld1w { z10.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z14.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0404a82 // ld1w { z2.s-z3.s }, pn10.b/Z, [x20]\n"
+ "addvl x20, x20, #5\n"
".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
"add x8, x8, #0x1\n"
- "addvl x20, x20, #5\n"
- ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
- "ld1w { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xa1414a86 // ld1w { z6.s, z14.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "ld1w { z11.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x11]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
"mov x12, #0x8\n"
- ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
- ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc1331901 // fmla za.s[x8, 1], { z8.s-z11.s }, z3.s\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
+ "st1w { z24.s }, p1, [x11]\n"
"add x11, x11, x9, LSL #2\n"
- ".inst 0xc1311a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z1.s\n"
- ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa0414a8c // ld1w { z12.s-z13.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"addvl x20, x20, #5\n"
"st1w { z25.s }, p1, [x10]\n"
- "ld1w { z15.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1311980 // fmla za.s[x8, 0], { z12.s-z15.s }, z1.s\n"
- ".inst 0xc1391981 // fmla za.s[x8, 1], { z12.s-z15.s }, z9.s\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
- ".inst 0xc13b1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z11.s\n"
"add x10, x10, x28, LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x22]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
"st1w { z26.s }, p1, [x27]\n"
- ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "addvl x20, x20, #5\n"
- ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ ".inst 0xc13e1be0 // fmla za.s[x8, 0], { z31.s-z2.s }, z14.s\n"
+ ".inst 0xa1414a86 // ld1w { z6.s, z14.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"add x27, x27, x25, LSL #2\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"st1w { z27.s }, p1, [x26]\n"
- ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
- "mov x12, #0x0\n"
- ".inst 0xc13919a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z9.s\n"
- "add x26, x26, x24, LSL #2\n"
- ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13f1be1 // fmla za.s[x8, 1], { z31.s-z2.s }, z15.s\n"
+ ".inst 0xa1404a87 // ld1w { z7.s, z15.s }, pn10.b/Z, [x20]\n"
"addvl x20, x20, #5\n"
- ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
- "ld1w { z21.s }, p0/Z, [x22]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc13d1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z13.s\n"
+ ".inst 0xa0414a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- "ld1w { z22.s }, p0/Z, [x14]\n"
+ ".inst 0xc1351921 // fmla za.s[x8, 1], { z9.s-z12.s }, z5.s\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13e1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z14.s\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
- ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
- ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
- "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z15.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371940 // fmla za.s[x8, 0], { z10.s-z13.s }, z7.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1351941 // fmla za.s[x8, 1], { z10.s-z13.s }, z5.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z11.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
"add x21, x14, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
- ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
- ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
- ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1361921 // fmla za.s[x8, 1], { z9.s-z12.s }, z6.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa04149e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371922 // fmla za.s[x8, 2], { z9.s-z12.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0xc1351b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z5.s\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x14, x14, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
- ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc1311ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xc1301b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z4.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
- "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
- "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
- "ld1w { z22.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
- "ld1w { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381940 // fmla za.s[x8, 0], { z10.s-z13.s }, z8.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1321941 // fmla za.s[x8, 1], { z10.s-z13.s }, z2.s\n"
+ ".inst 0xa04149e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1361942 // fmla za.s[x8, 2], { z10.s-z13.s }, z6.s\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z23.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1311b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z1.s\n"
+ "mov x12, #0x4\n"
+ ".inst 0xa04049e2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z15.s\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z0.s\n"
+ ".inst 0xc1371b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z7.s\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
- ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1371b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z7.s\n"
- "ld1w { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0xc1341961 // fmla za.s[x8, 1], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xa0414a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1321962 // fmla za.s[x8, 2], { z11.s-z14.s }, z2.s\n"
+ "ld1w { z11.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z24.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
- ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
- "add x8, x8, #0x1\n"
+ ".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
"addvl x20, x20, #5\n"
- ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
- "ld1w { z19.s }, p0/Z, [x21]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xa0404a82 // ld1w { z2.s-z3.s }, pn10.b/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z8.s }, p1, [x11]\n"
- "mov x12, #0x8\n"
- ".inst 0xc13f1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z15.s\n"
- ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
- "add x11, x11, x9, LSL #2\n"
- ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
- ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"addvl x20, x20, #5\n"
- "st1w { z9.s }, p1, [x10]\n"
- "ld1w { z25.s }, p0/Z, [x21]\n"
+ "ld1w { z12.s }, p0/Z, [x21]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1b0ca38 // fclamp { z24.s-z27.s }, z17.s, z16.s\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1351920 // fmla za.s[x8, 0], { z9.s-z12.s }, z5.s\n"
+ "ld1w { z2.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1381ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z8.s\n"
- ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xa1414a86 // ld1w { z6.s, z14.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ ".inst 0xc13f1be0 // fmla za.s[x8, 0], { z31.s-z2.s }, z15.s\n"
+ ".inst 0xa0414a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ ".inst 0xc1331be1 // fmla za.s[x8, 1], { z31.s-z2.s }, z3.s\n"
".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
- ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
- "add x10, x10, x28, LSL #2\n"
- "st1w { z10.s }, p1, [x27]\n"
- ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
"addvl x20, x20, #5\n"
- ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
- "add x27, x27, x25, LSL #2\n"
- "ld1w { z26.s }, p0/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "st1w { z11.s }, p1, [x26]\n"
- ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
- ".inst 0xc13c1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z12.s\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xa1414a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "addvl x20, x20, #5\n"
- ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
- "ld1w { z21.s }, p0/Z, [x21]\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
- ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z1.s\n"
- ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p0/Z, [x21]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z14.s\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1391941 // fmla za.s[x8, 1], { z10.s-z13.s }, z9.s\n"
+ "ld1w { z14.s }, p0/Z, [x21]\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
"ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
- ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
- ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
- ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- "addvl x15, x15, #5\n"
+ ".inst 0xc1351800 // fmla za.s[x8, 0], { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1371801 // fmla za.s[x8, 1], { z0.s-z3.s }, z7.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13f1960 // fmla za.s[x8, 0], { z11.s-z14.s }, z15.s\n"
+ ".inst 0xa04149e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1391961 // fmla za.s[x8, 1], { z11.s-z14.s }, z9.s\n"
".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x15, #4, MUL VL]\n"
"addvl x15, x15, #5\n"
"20:" // Main loop skip tail
"cbz x16, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x14]\n"
"add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z25.s }, p0/Z, [x14]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1361b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xc1371b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
- ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- "addvl x15, x15, #5\n"
- ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351960 // fmla za.s[x8, 0], { z11.s-z14.s }, z5.s\n"
+ ".inst 0xa14149e2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z10.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301961 // fmla za.s[x8, 1], { z11.s-z14.s }, z0.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1371ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z7.s\n"
- ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
- ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341962 // fmla za.s[x8, 2], { z11.s-z14.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
"addvl x15, x15, #5\n"
- ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0xc13a1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z10.s\n"
- "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
- ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
- ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z1.s\n"
".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
- ".inst 0xc1301b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z0.s\n"
- ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
- "st1w { z24.s }, p1, [x11]\n"
+ ".inst 0xc13a1980 // fmla za.s[x8, 0], { z12.s-z15.s }, z10.s\n"
+ ".inst 0xa04149e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1331982 // fmla za.s[x8, 2], { z12.s-z15.s }, z3.s\n"
+ ".inst 0xc1361b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z6.s\n"
+ ".inst 0xc1341b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z4.s\n"
+ ".inst 0xc1381b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z8.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1b0ca24 // fclamp { z4.s-z7.s }, z17.s, z16.s\n"
+ "st1w { z4.s }, p1, [x11]\n"
"add x11, x11, x9, LSL #2\n"
- ".inst 0xc1301a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z0.s\n"
- "st1w { z25.s }, p1, [x10]\n"
+ "st1w { z5.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1381a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z8.s\n"
- "add x8, x8, #0x1\n"
- "st1w { z26.s }, p1, [x27]\n"
+ "st1w { z6.s }, p1, [x27]\n"
"add x27, x27, x25, LSL #2\n"
- "st1w { z27.s }, p1, [x26]\n"
+ "st1w { z7.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"21:" // Tail input: End
"cbz x13, 23f\n"
"22:" // Right padding loop
".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1b0ca28 // fclamp { z8.s-z11.s }, z17.s, z16.s\n"
"st1w { z8.s }, p1, [x11]\n"
"add x11, x11, x9, LSL #2\n"
- ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
"st1w { z9.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
"st1w { z10.s }, p1, [x27]\n"
@@ -1137,12 +1137,12 @@ void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
"bgt 22b\n"
"23:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
index be82e04613..412d786d8a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,99 +72,99 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x6\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x7\n"
"ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
- "ld1rw { z25.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z26.s, #0x0\n"
+ "fmov z4.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z26.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x21\n"
- "fmov z6.s, #0x0\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_weights]]\n"
+ "fmov z1.s, #0x0\n"
+ "fmov z12.s, #0x0\n"
+ "mov x24, #0x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "fmov z15.s, #0x0\n"
+ "add x20, x17, x7\n"
+ "lsl x23, %x[ld_in_row], #0x2\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "mov z5.d, z4.d\n"
+ "mov x8, #0x0\n"
+ "sub x24, x24, x20\n"
+ "mov x22, x25\n"
+ "incb x25\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ld1w { z31.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #3\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z21.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #3\n"
+ "orr x21, x20, %x[ld_in_col], LSL #18\n"
+ "ld1w { z10.s }, p2/Z, [x22]\n"
+ "mov x20, x25\n"
+ "incb x25\n"
+ ".inst 0x648aabe1 // bfcvtnt z1.h, p2/M, z31.s\n"
+ ".inst 0x658aabe2 // bfcvt z2.h, p2/M, z31.s\n"
+ "ld1w { z8.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #3\n"
- "incb x21\n"
+ ".inst 0x658aaaa3 // bfcvt z3.h, p2/M, z21.s\n"
+ "orr x21, x16, x21, LSL #20\n"
+ "madd x23, x23, x17, x13\n"
+ ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
+ "lsl x21, x21, #0x2\n"
+ ".inst 0x658aa907 // bfcvt z7.h, p2/M, z8.s\n"
+ ".inst 0x648aa90c // bfcvtnt z12.h, p2/M, z8.s\n"
+ ".inst 0x648aaaa2 // bfcvtnt z2.h, p2/M, z21.s\n"
"ld1w { z29.s }, p2/Z, [x20]\n"
- ".inst 0x648aa9e6 // bfcvtnt z6.h, p2/M, z15.s\n"
"incb x20, ALL, MUL #3\n"
- "ld1w { z30.s }, p2/Z, [x20]\n"
- "mov x20, x21\n"
- ".inst 0x658aa9e5 // bfcvt z5.h, p2/M, z15.s\n"
- "ld1w { z14.s }, p2/Z, [x20]\n"
- ".inst 0x658aaba8 // bfcvt z8.h, p2/M, z29.s\n"
- "fmov z11.s, #0x0\n"
+ ".inst 0x648aa943 // bfcvtnt z3.h, p2/M, z10.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
+ "mov x20, x25\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #3\n"
- ".inst 0x658aa9ca // bfcvt z10.h, p2/M, z14.s\n"
- ".inst 0x648aaba5 // bfcvtnt z5.h, p2/M, z29.s\n"
- "incb x21\n"
- "ld1w { z24.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- ".inst 0x648aabc8 // bfcvtnt z8.h, p2/M, z30.s\n"
- ".inst 0x658aabcc // bfcvt z12.h, p2/M, z30.s\n"
- "ld1w { z28.s }, p2/Z, [x20]\n"
- "mov x21, x21\n"
- ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
- "ld1w { z20.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
- ".inst 0x658aab09 // bfcvt z9.h, p2/M, z24.s\n"
- "ld1w { z15.s }, p2/Z, [x21]\n"
- "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- "incb x21, ALL, MUL #3\n"
- "fmov z14.s, #0x0\n"
- ".inst 0x658aaa81 // bfcvt z1.h, p2/M, z20.s\n"
- "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0x658aa9e7 // bfcvt z7.h, p2/M, z15.s\n"
- ".inst 0x648aab89 // bfcvtnt z9.h, p2/M, z28.s\n"
- "sub x20, x14, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #18\n"
- ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
- "ld1w { z29.s }, p2/Z, [x21]\n"
- "orr x23, x16, x23, LSL #20\n"
- "mov x22, #0x6\n"
- "add x21, x17, x7\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "mov z27.d, z26.d\n"
- ".inst 0x648aaa8e // bfcvtnt z14.h, p2/M, z20.s\n"
- ".inst 0x648aa9e1 // bfcvtnt z1.h, p2/M, z15.s\n"
".inst 0x648aaba7 // bfcvtnt z7.h, p2/M, z29.s\n"
- "mov x8, #0x0\n"
- "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
- "lsl x23, x23, #0x2\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x17, x13\n"
+ ".inst 0x658aaba0 // bfcvt z0.h, p2/M, z29.s\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aaa86 // bfcvt z6.h, p2/M, z20.s\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ ".inst 0x658aab28 // bfcvt z8.h, p2/M, z25.s\n"
+ ".inst 0x648aaa80 // bfcvtnt z0.h, p2/M, z20.s\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ ".inst 0x648aab29 // bfcvtnt z9.h, p2/M, z25.s\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ ".inst 0x658aa94a // bfcvt z10.h, p2/M, z10.s\n"
"3:" // Issue prefetches
- "subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "subs x24, x24, #0x1\n"
+ ".inst 0xf8b54afc // rprfm pldstrm, x21, [x23]\n"
+ "add x23, x23, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x20, x13\n"
- ".inst 0xc0040b40 // mova za.d[x8, #0], { z26.d-z27.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040b41 // mova za.d[x8, #1], { z26.d-z27.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0040880 // mova za.d[x8, #0], { z4.d-z5.d }\n"
"mov x10, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x13, x17, x21, x13\n"
+ ".inst 0xc0040881 // mova za.d[x8, #1], { z4.d-z5.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040882 // mova za.d[x8, #2], { z4.d-z5.d }\n"
"ldp x9, x28, [x22], #0x10\n"
- ".inst 0xc0040b42 // mova za.d[x8, #2], { z26.d-z27.d }\n"
+ ".inst 0xc0040883 // mova za.d[x8, #3], { z4.d-z5.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0040b43 // mova za.d[x8, #3], { z26.d-z27.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ ".inst 0xc0040884 // mova za.d[x8, #4], { z4.d-z5.d }\n"
+ ".inst 0xc0040885 // mova za.d[x8, #5], { z4.d-z5.d }\n"
"ldp x25, x24, [x22], #0x10\n"
- ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"ldp x23, x22, [x20], #0x10\n"
"cbz x21, 5f\n"
"cmp x21, x10\n"
@@ -172,19 +172,19 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x10, x10, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060814 // mova { z20.d-z21.d }, za.d[x8, #0]\n"
+ ".inst 0xc006081c // mova { z28.d-z29.d }, za.d[x8, #0]\n"
"sub x11, x11, x21\n"
- ".inst 0xc0060836 // mova { z22.d-z23.d }, za.d[x8, #1]\n"
- ".inst 0xc1adcb34 // fclamp { z20.s-z23.s }, z25.s, z13.s\n"
+ ".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcafc // fclamp { z28.s-z31.s }, z23.s, z13.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z20.s }, p1, [x9]\n"
+ "st1w { z28.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z22.s }, p1, [x28]\n"
+ "st1w { z30.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- "st1w { z21.s }, p1, [x25]\n"
+ "st1w { z29.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "st1w { z23.s }, p1, [x24]\n"
+ "st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -196,124 +196,124 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x13]\n"
- ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"
+ "ld1w { z21.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab9e // bfcvtnt z30.h, p2/M, z28.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa9ff // bfcvtnt z31.h, p2/M, z15.s\n"
- ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaabb // bfcvt z27.h, p2/M, z21.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
- ".inst 0xc12613d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z6.h\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
- ".inst 0x648aa9e0 // bfcvtnt z0.h, p2/M, z15.s\n"
- ".inst 0xc12c13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z12.h\n"
- ".inst 0xc12813f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z8.h\n"
+ ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa7b // bfcvtnt z27.h, p2/M, z19.s\n"
+ ".inst 0x658aaadd // bfcvt z29.h, p2/M, z22.s\n"
+ ".inst 0x648aaa3c // bfcvtnt z28.h, p2/M, z17.s\n"
+ ".inst 0x648aaa1d // bfcvtnt z29.h, p2/M, z16.s\n"
+ ".inst 0xc1221370 // bfdot za.s[x8, 0], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xc1211371 // bfdot za.s[x8, 1], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc12e1390 // bfdot za.s[x8, 0], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc1231391 // bfdot za.s[x8, 1], { z28.h-z29.h }, z3.h\n"
"7:" // Unpadded: 1 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z31.s }, p1/Z, [x13]\n"
- ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
- ".inst 0xc12a11f0 // bfdot za.s[x8, 0], { z15.h-z16.h }, z10.h\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ ".inst 0x658aab13 // bfcvt z19.h, p2/M, z24.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaad1 // bfcvt z17.h, p2/M, z22.s\n"
- ".inst 0xc12b11f1 // bfdot za.s[x8, 1], { z15.h-z16.h }, z11.h\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
- ".inst 0xc12511f2 // bfdot za.s[x8, 2], { z15.h-z16.h }, z5.h\n"
- ".inst 0xc12611f3 // bfdot za.s[x8, 3], { z15.h-z16.h }, z6.h\n"
- ".inst 0xc1241210 // bfdot za.s[x8, 0], { z16.h-z17.h }, z4.h\n"
- ".inst 0xc1291211 // bfdot za.s[x8, 1], { z16.h-z17.h }, z9.h\n"
- ".inst 0xc12c1212 // bfdot za.s[x8, 2], { z16.h-z17.h }, z12.h\n"
- ".inst 0xc1281213 // bfdot za.s[x8, 3], { z16.h-z17.h }, z8.h\n"
+ ".inst 0x658aaa94 // bfcvt z20.h, p2/M, z20.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ ".inst 0x648aabd3 // bfcvtnt z19.h, p2/M, z30.s\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ ".inst 0x648aaa54 // bfcvtnt z20.h, p2/M, z18.s\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ ".inst 0xc1271270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z7.h\n"
+ ".inst 0xc12c1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z12.h\n"
+ ".inst 0xc1221272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z2.h\n"
+ ".inst 0xc1211273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z1.h\n"
+ ".inst 0xc1261290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xc1201291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc12e1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1231293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z3.h\n"
"8:" // Unpadded: 0 priming loads
"cbz x14, 16f\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
- ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
"sub x14, x14, #0x1\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x11, x11, #0x1\n"
- ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa817 // bfcvt z23.h, p2/M, z0.s\n"
"cmp x14, x11\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa3c // bfcvt z28.h, p2/M, z17.s\n"
"csel x21, x14, x11, LT\n"
- ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa818 // bfcvt z24.h, p2/M, z0.s\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ ".inst 0x658aaabd // bfcvt z29.h, p2/M, z21.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"sub x11, x11, x21\n"
+ ".inst 0x658aaa9e // bfcvt z30.h, p2/M, z20.s\n"
+ ".inst 0x648aaa1c // bfcvtnt z28.h, p2/M, z16.s\n"
+ ".inst 0x648aab7d // bfcvtnt z29.h, p2/M, z27.s\n"
+ ".inst 0x648aaa3e // bfcvtnt z30.h, p2/M, z17.s\n"
"cbz x21, 15f\n"
"9:" // Unpadded: Main loop
- ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc1291390 // bfdot za.s[x8, 0], { z28.h-z29.h }, z9.h\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x13]\n"
+ "ld1w { z19.s }, p1/Z, [x13]\n"
"subs x21, x21, #0x1\n"
- ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
- "ld1w { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc12f1391 // bfdot za.s[x8, 1], { z28.h-z29.h }, z15.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
- ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc1271392 // bfdot za.s[x8, 2], { z28.h-z29.h }, z7.h\n"
+ ".inst 0xc12c1393 // bfdot za.s[x8, 3], { z28.h-z29.h }, z12.h\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc12a13b0 // bfdot za.s[x8, 0], { z29.h-z30.h }, z10.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
- ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
- ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
- ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
- ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
- ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
- ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
- ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
- ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
- ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
- ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ ".inst 0xc12813b1 // bfdot za.s[x8, 1], { z29.h-z30.h }, z8.h\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc1221394 // bfdot za.s[x8, 4], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xc1211395 // bfdot za.s[x8, 5], { z28.h-z29.h }, z1.h\n"
+ ".inst 0x658aaa7c // bfcvt z28.h, p2/M, z19.s\n"
+ ".inst 0xc12613b2 // bfdot za.s[x8, 2], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc12013b3 // bfdot za.s[x8, 3], { z29.h-z30.h }, z0.h\n"
+ ".inst 0xc12e13b4 // bfdot za.s[x8, 4], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc12313b5 // bfdot za.s[x8, 5], { z29.h-z30.h }, z3.h\n"
+ ".inst 0x658aaa3d // bfcvt z29.h, p2/M, z17.s\n"
+ ".inst 0x658aaa1e // bfcvt z30.h, p2/M, z16.s\n"
".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
- ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
- ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ ".inst 0x648aaa5c // bfcvtnt z28.h, p2/M, z18.s\n"
".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ ".inst 0xc0040884 // mova za.d[x8, #4], { z4.d-z5.d }\n"
+ ".inst 0xc0040885 // mova za.d[x8, #5], { z4.d-z5.d }\n"
+ ".inst 0x648aaabd // bfcvtnt z29.h, p2/M, z21.s\n"
+ ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
+ ".inst 0xc1adcaf0 // fclamp { z16.s-z19.s }, z23.s, z13.s\n"
"st1w { z16.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"st1w { z18.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
"st1w { z17.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"st1w { z19.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 9b\n"
@@ -325,186 +325,186 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1251290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z5.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ ".inst 0x658aaa34 // bfcvt z20.h, p2/M, z17.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
- ".inst 0xc1261291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z6.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12c12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
- ".inst 0xc12812b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z8.h\n"
+ ".inst 0xc1221270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z2.h\n"
+ ".inst 0xc1211271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z1.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12e1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1231291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z3.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa34 // bfcvt z20.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12a1270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z10.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
- ".inst 0xc12b1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z11.h\n"
- ".inst 0xc1251272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z5.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1261273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z6.h\n"
- ".inst 0xc1241290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z4.h\n"
- ".inst 0xc1291291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z9.h\n"
- ".inst 0xc12c1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z12.h\n"
- ".inst 0xc1281293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z8.h\n"
+ ".inst 0x658aab36 // bfcvt z22.h, p2/M, z25.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1271290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xc12c1291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z12.h\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1221292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xc1211293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xc12612b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xc12012b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z0.h\n"
+ ".inst 0xc12e12b2 // bfdot za.s[x8, 2], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc12312b3 // bfdot za.s[x8, 3], { z21.h-z22.h }, z3.h\n"
"13:" // Padded: 0 priming loads
"cbz x14, 16f\n"
"mov x12, #0x0\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
"ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaa1c // bfcvt z28.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa3c // bfcvtnt z28.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0x658aaa1d // bfcvt z29.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa3d // bfcvtnt z29.h, p2/M, z17.s\n"
+ ".inst 0x658aaa1e // bfcvt z30.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "sub x14, x14, #0x1\n"
- ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
- "sub x11, x11, #0x1\n"
- "cmp x14, x11\n"
- "csel x21, x14, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "sub x11, x11, x21\n"
+ ".inst 0x648aaa1e // bfcvtnt z30.h, p2/M, z16.s\n"
"cbz x21, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
- ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z20.s }, p0/Z, [x13]\n"
- ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1291390 // bfdot za.s[x8, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc12f1391 // bfdot za.s[x8, 1], { z28.h-z29.h }, z15.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1271392 // bfdot za.s[x8, 2], { z28.h-z29.h }, z7.h\n"
+ ".inst 0xc12c1393 // bfdot za.s[x8, 3], { z28.h-z29.h }, z12.h\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a13b0 // bfdot za.s[x8, 0], { z29.h-z30.h }, z10.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12813b1 // bfdot za.s[x8, 1], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xc1221394 // bfdot za.s[x8, 4], { z28.h-z29.h }, z2.h\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1211395 // bfdot za.s[x8, 5], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc12613b2 // bfdot za.s[x8, 2], { z29.h-z30.h }, z6.h\n"
+ ".inst 0x658aaa3c // bfcvt z28.h, p2/M, z17.s\n"
+ ".inst 0xc12013b3 // bfdot za.s[x8, 3], { z29.h-z30.h }, z0.h\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
- "mov x12, #0x4\n"
- ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc12e13b4 // bfdot za.s[x8, 4], { z29.h-z30.h }, z14.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc12313b5 // bfdot za.s[x8, 5], { z29.h-z30.h }, z3.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa1c // bfcvtnt z28.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "subs x21, x21, #0x1\n"
- ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
- ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
- ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
- ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
- ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
- ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
- ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
- ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
- ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
- ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
- ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
- ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0x658aaa3d // bfcvt z29.h, p2/M, z17.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
- "st1w { z16.s }, p1, [x9]\n"
+ ".inst 0xc0040884 // mova za.d[x8, #4], { z4.d-z5.d }\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040885 // mova za.d[x8, #5], { z4.d-z5.d }\n"
+ ".inst 0xc1adcaf8 // fclamp { z24.s-z27.s }, z23.s, z13.s\n"
+ ".inst 0x648aaa1d // bfcvtnt z29.h, p2/M, z16.s\n"
+ ".inst 0x658aabfe // bfcvt z30.h, p2/M, z31.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "st1w { z24.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z18.s }, p1, [x28]\n"
+ "st1w { z26.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
- "st1w { z17.s }, p1, [x25]\n"
+ "st1w { z25.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
- "st1w { z19.s }, p1, [x24]\n"
+ ".inst 0x648aaa1e // bfcvtnt z30.h, p2/M, z16.s\n"
+ "st1w { z27.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 14b\n"
"15:" // Main loop tail
- ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
- ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
- ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
- ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
- ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
- ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
- ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
- ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
- ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
- ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
- ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
- "st1w { z16.s }, p1, [x9]\n"
+ ".inst 0xc1291390 // bfdot za.s[x8, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc12f1391 // bfdot za.s[x8, 1], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc1271392 // bfdot za.s[x8, 2], { z28.h-z29.h }, z7.h\n"
+ ".inst 0xc12c1393 // bfdot za.s[x8, 3], { z28.h-z29.h }, z12.h\n"
+ ".inst 0xc12a13b0 // bfdot za.s[x8, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc12813b1 // bfdot za.s[x8, 1], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xc1221394 // bfdot za.s[x8, 4], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xc1211395 // bfdot za.s[x8, 5], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc12613b2 // bfdot za.s[x8, 2], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc12013b3 // bfdot za.s[x8, 3], { z29.h-z30.h }, z0.h\n"
+ ".inst 0xc12e13b4 // bfdot za.s[x8, 4], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc12313b5 // bfdot za.s[x8, 5], { z29.h-z30.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcaf8 // fclamp { z24.s-z27.s }, z23.s, z13.s\n"
+ ".inst 0xc0040884 // mova za.d[x8, #4], { z4.d-z5.d }\n"
+ ".inst 0xc0040885 // mova za.d[x8, #5], { z4.d-z5.d }\n"
+ "st1w { z24.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
- "st1w { z18.s }, p1, [x28]\n"
+ "st1w { z26.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
- "st1w { z17.s }, p1, [x25]\n"
+ "st1w { z25.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
- "st1w { z19.s }, p1, [x24]\n"
+ "st1w { z27.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
- ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
- ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"16:" // Main loop skip tail
"cbz x11, 18f\n"
"17:" // Right padding loop
@@ -512,25 +512,25 @@ void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(
"subs x11, x11, #0x1\n"
".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1adcb3c // fclamp { z28.s-z31.s }, z25.s, z13.s\n"
+ ".inst 0xc0040884 // mova za.d[x8, #4], { z4.d-z5.d }\n"
+ ".inst 0xc0040885 // mova za.d[x8, #5], { z4.d-z5.d }\n"
+ ".inst 0xc1adcafc // fclamp { z28.s-z31.s }, z23.s, z13.s\n"
"st1w { z28.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"st1w { z30.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
- ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
"st1w { z29.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
"st1w { z31.s }, p1, [x24]\n"
"add x24, x24, x22, LSL #2\n"
"bgt 17b\n"
"18:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
index a3b9ca402a..7298a88814 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,85 +72,85 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
"mov x20, #0x9\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x7\n"
"ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
"ptrue p2.b\n"
- "ld1rw { z4.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x20, x20, x7\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p1.s, XZR, x16\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z1.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
"whilelt p8.s, XZR, x17\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z24.s, #0x0\n"
+ "fmov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z24.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x21\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- "incb x21\n"
- "ld1w { z23.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
- "ld1w { z6.s }, p2/Z, [x20]\n"
- "mov x20, x21\n"
- ".inst 0x648aaaee // bfcvtnt z14.h, p2/M, z23.s\n"
- "incb x21\n"
- "ld1w { z28.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
- ".inst 0x658aa8c3 // bfcvt z3.h, p2/M, z6.s\n"
- ".inst 0x658aab88 // bfcvt z8.h, p2/M, z28.s\n"
- "ld1w { z10.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #3\n"
+ "ldr x25, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x24, #0x9\n"
+ "add x20, x17, x7\n"
+ "mov z21.d, z20.d\n"
"ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
- ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
- "ld1w { z2.s }, p2/Z, [x20]\n"
- "mov x21, x21\n"
- ".inst 0x658aa847 // bfcvt z7.h, p2/M, z2.s\n"
+ "lsl x23, %x[ld_in_row], #0x2\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
"ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z9.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
- "sub x20, x14, #0x1\n"
- "ld1w { z6.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #3\n"
- "orr x23, x20, %x[ld_in_col], LSL #18\n"
- "mov z25.d, z24.d\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "orr x23, x16, x23, LSL #20\n"
- "mov x22, #0x9\n"
- "mov z26.d, z24.d\n"
- "add x21, x17, x7\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "mov z27.d, z24.d\n"
- ".inst 0x648aa8c0 // bfcvtnt z0.h, p2/M, z6.s\n"
- ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
"mov x8, #0x0\n"
+ "sub x24, x24, x20\n"
+ "mov x22, x25\n"
+ "incb x25\n"
"ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x23, x23, #0x2\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x17, x13\n"
+ "ld1w { z5.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #3\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z14.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #3\n"
+ "orr x21, x20, %x[ld_in_col], LSL #18\n"
+ "ld1w { z29.s }, p2/Z, [x22]\n"
+ "mov x20, x25\n"
+ "incb x25\n"
+ ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "orr x21, x16, x21, LSL #20\n"
+ "lsl x21, x21, #0x2\n"
+ "madd x23, x23, x17, x13\n"
+ ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
+ ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
+ ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "mov x20, x25\n"
+ "ld1w { z16.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x658aa9e6 // bfcvt z6.h, p2/M, z15.s\n"
+ ".inst 0x648aa86a // bfcvtnt z10.h, p2/M, z3.s\n"
"3:" // Issue prefetches
- "subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "subs x24, x24, #0x1\n"
+ ".inst 0xf8b54afc // rprfm pldstrm, x21, [x23]\n"
+ "add x23, x23, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x13, x17, x20, x13\n"
- ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x13, x17, x21, x13\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
"ldp x10, x9, [x23], #0x10\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ldp x28, x27, [x20], #0x10\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
"ldp x26, x25, [x23], #0x10\n"
"ldp x24, x23, [x20], #0x10\n"
"cbz x21, 5f\n"
@@ -162,9 +162,9 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
"lsr x21, x21, #0x1\n"
"sub x11, x11, x21\n"
+ ".inst 0xc1adcb10 // fclamp { z16.s-z19.s }, z24.s, z13.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
"st1w { z16.s }, p1, [x10]\n"
@@ -185,176 +185,176 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 2 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x13]\n"
- ".inst 0x658aaa53 // bfcvt z19.h, p2/M, z18.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaaf4 // bfcvt z20.h, p2/M, z23.s\n"
- "ld1w { z2.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa854 // bfcvtnt z20.h, p2/M, z2.s\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaad5 // bfcvtnt z21.h, p2/M, z22.s\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa5f // bfcvt z31.h, p2/M, z18.s\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aabd6 // bfcvt z22.h, p2/M, z30.s\n"
- "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa996 // bfcvtnt z22.h, p2/M, z12.s\n"
- ".inst 0xc13e1270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z14.h\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
- ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
- ".inst 0xc1331290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ ".inst 0x648aa91e // bfcvtnt z30.h, p2/M, z8.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa9c1 // bfcvt z1.h, p2/M, z14.s\n"
+ ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
+ ".inst 0x648aa860 // bfcvtnt z0.h, p2/M, z3.s\n"
+ ".inst 0x658aa902 // bfcvt z2.h, p2/M, z8.s\n"
+ ".inst 0x648aab61 // bfcvtnt z1.h, p2/M, z27.s\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
"7:" // Unpadded: 1 priming loads
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x13]\n"
- ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaad0 // bfcvtnt z16.h, p2/M, z22.s\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab91 // bfcvt z17.h, p2/M, z28.s\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
- "ld1w { z2.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa852 // bfcvt z18.h, p2/M, z2.s\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
- "ld1w { z2.s }, p1/Z, [x20]\n"
+ ".inst 0x658aaa7a // bfcvt z26.h, p2/M, z19.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa853 // bfcvt z19.h, p2/M, z2.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabbb // bfcvt z27.h, p2/M, z29.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
- ".inst 0xc1381210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z8.h\n"
- "ld1w { z10.s }, p1/Z, [x20]\n"
- ".inst 0x658aa954 // bfcvt z20.h, p2/M, z10.s\n"
- ".inst 0xc1371230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z7.h\n"
+ ".inst 0x648aa819 // bfcvtnt z25.h, p2/M, z0.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa99c // bfcvt z28.h, p2/M, z12.s\n"
+ ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
+ ".inst 0x648aa87b // bfcvtnt z27.h, p2/M, z3.s\n"
+ ".inst 0x658aa8fd // bfcvt z29.h, p2/M, z7.s\n"
+ ".inst 0x648aaa7c // bfcvtnt z28.h, p2/M, z19.s\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ ".inst 0xc1351350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z5.h\n"
"8:" // Unpadded: 0 priming loads
"cmp x14, #0x2\n"
"blt 16f\n"
"add x21, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
"sub x14, x14, #0x2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x11, x11, #0x1\n"
- ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"lsr x20, x14, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
"cmp x20, x11\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
"csel x22, x20, x11, LT\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
"and x14, x14, #0x1\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa31 // bfcvt z17.h, p2/M, z17.s\n"
+ ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
"sub x11, x11, x22\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0x658aab52 // bfcvt z18.h, p2/M, z26.s\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
+ ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
+ ".inst 0x658aa813 // bfcvt z19.h, p2/M, z0.s\n"
+ ".inst 0x648aabb2 // bfcvtnt z18.h, p2/M, z29.s\n"
"cbz x22, 15f\n"
"9:" // Unpadded: Main loop
"add x21, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x13]\n"
- ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
+ ".inst 0xc13a11f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z10.h\n"
"add x13, x13, %x[ld_in_col], LSL #2\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "ld1w { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ ".inst 0xc13b11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z11.h\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x21]\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
- ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aabcb // bfcvt z11.h, p2/M, z30.s\n"
- ".inst 0x648aa9e9 // bfcvtnt z9.h, p2/M, z15.s\n"
- "ld1w { z19.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa9dc // bfcvt z28.h, p2/M, z14.s\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
- ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ "ld1w { z8.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "ld1w { z2.s }, p1/Z, [x21]\n"
+ ".inst 0xc1391211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z9.h\n"
+ ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
+ "ld1w { z7.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa84c // bfcvtnt z12.h, p2/M, z2.s\n"
+ ".inst 0x658aa85e // bfcvt z30.h, p2/M, z2.s\n"
+ "ld1w { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5c // bfcvtnt z28.h, p2/M, z26.s\n"
+ "ld1w { z15.s }, p1/Z, [x13]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- "ld1w { z29.s }, p1/Z, [x13]\n"
- ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
- ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
- "subs x22, x22, #0x1\n"
- "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aa8ff // bfcvt z31.h, p2/M, z7.s\n"
+ ".inst 0x648aab7d // bfcvtnt z29.h, p2/M, z27.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ ".inst 0x648aa91e // bfcvtnt z30.h, p2/M, z8.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1adcb10 // fclamp { z16.s-z19.s }, z24.s, z13.s\n"
+ ".inst 0x648aa83f // bfcvtnt z31.h, p2/M, z1.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab20 // bfcvt z0.h, p2/M, z25.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
"st1w { z16.s }, p1, [x10]\n"
- "ld1w { z16.s }, p1/Z, [x21]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
"add x10, x10, x28, LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
+ ".inst 0xc1341390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z4.h\n"
+ ".inst 0x658aa9ef // bfcvt z15.h, p2/M, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
"st1w { z17.s }, p1, [x9]\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
"add x9, x9, x27, LSL #2\n"
"st1w { z18.s }, p1, [x26]\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
"add x26, x26, x24, LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
"st1w { z19.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaac9 // bfcvtnt z9.h, p2/M, z22.s\n"
- ".inst 0x648aabea // bfcvtnt z10.h, p2/M, z31.s\n"
- "ld1w { z31.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0x658aabed // bfcvt z13.h, p2/M, z31.s\n"
+ ".inst 0xc13513b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z5.h\n"
+ ".inst 0x658aa850 // bfcvt z16.h, p2/M, z2.s\n"
+ ".inst 0x658aa911 // bfcvt z17.h, p2/M, z8.s\n"
+ ".inst 0x658aa832 // bfcvt z18.h, p2/M, z1.s\n"
+ ".inst 0x648aa8ef // bfcvtnt z15.h, p2/M, z7.s\n"
+ ".inst 0x658aa993 // bfcvt z19.h, p2/M, z12.s\n"
+ ".inst 0x648aab50 // bfcvtnt z16.h, p2/M, z26.s\n"
+ ".inst 0x648aab71 // bfcvtnt z17.h, p2/M, z27.s\n"
+ ".inst 0x648aab32 // bfcvtnt z18.h, p2/M, z25.s\n"
"bgt 9b\n"
"b 15f\n"
"10:" // Padded
@@ -364,350 +364,350 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"beq 12f\n"
"11:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- ".inst 0x658aa98a // bfcvt z10.h, p2/M, z12.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z12.s }, p0/Z, [x20]\n"
- ".inst 0x648aa98a // bfcvtnt z10.h, p2/M, z12.s\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa1e // bfcvtnt z30.h, p2/M, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x648aa99f // bfcvtnt z31.h, p2/M, z12.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaba0 // bfcvt z0.h, p2/M, z29.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa20 // bfcvtnt z0.h, p2/M, z17.s\n"
+ ".inst 0x658aaa01 // bfcvt z1.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- ".inst 0xc13e1130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z14.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1331150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ ".inst 0x658aaa02 // bfcvt z2.h, p2/M, z16.s\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
"12:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9b1 // bfcvt z17.h, p2/M, z13.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa991 // bfcvtnt z17.h, p2/M, z12.s\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa932 // bfcvt z18.h, p2/M, z9.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z11.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aa871 // bfcvtnt z17.h, p2/M, z3.s\n"
+ ".inst 0x658aabd2 // bfcvt z18.h, p2/M, z30.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- ".inst 0x658aaab3 // bfcvt z19.h, p2/M, z21.s\n"
- ".inst 0xc13811f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z8.h\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1371210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z7.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ ".inst 0x648aabf2 // bfcvtnt z18.h, p2/M, z31.s\n"
+ ".inst 0x658aa833 // bfcvt z19.h, p2/M, z1.s\n"
+ ".inst 0xc13411f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z4.h\n"
+ ".inst 0xc1351210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z5.h\n"
"13:" // Padded: 0 priming loads
"cmp x14, #0x2\n"
"blt 16f\n"
"mov x12, #0x0\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "and x14, x14, #0x1\n"
"ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa51 // bfcvt z17.h, p2/M, z18.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
"mov x12, #0x8\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa71 // bfcvtnt z17.h, p2/M, z19.s\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
- "sub x14, x14, #0x2\n"
- "sub x11, x11, #0x1\n"
- "lsr x20, x14, #0x1\n"
- "cmp x20, x11\n"
- "csel x21, x20, x11, LT\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "and x14, x14, #0x1\n"
- "sub x11, x11, x21\n"
- "cbz x21, 15f\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa52 // bfcvt z18.h, p2/M, z18.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab32 // bfcvtnt z18.h, p2/M, z25.s\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ "cbz x22, 15f\n"
"14:" // Padded: Main loop
"mov x12, #0x0\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a11f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z10.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z18.s }, p0/Z, [x13]\n"
- ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z11.h\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z9.h\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab39 // bfcvt z25.h, p2/M, z25.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa49 // bfcvt z9.h, p2/M, z18.s\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x648aaa19 // bfcvtnt z25.h, p2/M, z16.s\n"
+ ".inst 0xc1adcb00 // fclamp { z0.s-z3.s }, z24.s, z13.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z2.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa84b // bfcvt z11.h, p2/M, z2.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabba // bfcvt z26.h, p2/M, z29.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa29 // bfcvtnt z9.h, p2/M, z17.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z0.s }, p1, [x10]\n"
+ ".inst 0x648aaa3a // bfcvtnt z26.h, p2/M, z17.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z2.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z28.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab8c // bfcvt z12.h, p2/M, z28.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1b // bfcvt z27.h, p2/M, z16.s\n"
+ "st1w { z3.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
"mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa6a // bfcvtnt z10.h, p2/M, z19.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa3b // bfcvtnt z27.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z13.s }, p0/Z, [x20]\n"
- ".inst 0x648aa9eb // bfcvtnt z11.h, p2/M, z15.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1c // bfcvt z28.h, p2/M, z16.s\n"
"mov x12, #0x0\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x13]\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ ".inst 0x648aaa3c // bfcvtnt z28.h, p2/M, z17.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z21.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aaa1d // bfcvt z29.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
"ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x658aaa4f // bfcvt z15.h, p2/M, z18.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x4\n"
- "ld1w { z20.s }, p0/Z, [x20]\n"
- "st1w { z29.s }, p1, [x9]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z5.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "st1w { z30.s }, p1, [x26]\n"
- "add x8, x8, #0x1\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
+ ".inst 0x658aaa51 // bfcvt z17.h, p2/M, z18.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z17.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa6b // bfcvt z11.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa71 // bfcvtnt z17.h, p2/M, z19.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "subs x21, x21, #0x1\n"
- "add x10, x10, x28, LSL #2\n"
- "st1w { z31.s }, p1, [x25]\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "add x9, x9, x27, LSL #2\n"
- "add x26, x26, x24, LSL #2\n"
- ".inst 0x648aaaa9 // bfcvtnt z9.h, p2/M, z21.s\n"
- ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
- "add x25, x25, x23, LSL #2\n"
- ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
- ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa52 // bfcvt z18.h, p2/M, z18.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab32 // bfcvtnt z18.h, p2/M, z25.s\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
"bgt 14b\n"
"15:" // Main loop tail
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z17.s }, p0/Z, [x13]\n"
- ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
"add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a11f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z10.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13b11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z11.h\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z2.s }, p0/Z, [x20]\n"
- ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1391211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z9.h\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab2f // bfcvt z15.h, p2/M, z25.s\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z23.s }, p0/Z, [x20]\n"
- ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
- ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ ".inst 0xc1adcb00 // fclamp { z0.s-z3.s }, z24.s, z13.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x658aab90 // bfcvt z16.h, p2/M, z28.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
+ "st1w { z0.s }, p1, [x10]\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z2.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ ".inst 0x658aaa51 // bfcvt z17.h, p2/M, z18.s\n"
+ "st1w { z3.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa71 // bfcvtnt z17.h, p2/M, z19.s\n"
+ ".inst 0x658aaa52 // bfcvt z18.h, p2/M, z18.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0x648aa9f4 // bfcvtnt z20.h, p2/M, z15.s\n"
- ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
- ".inst 0xc1381250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z8.h\n"
- ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
- "st1w { z28.s }, p1, [x10]\n"
- "add x10, x10, x28, LSL #2\n"
- "st1w { z29.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
- "add x13, x13, %x[ld_in_col], LSL #2\n"
- "st1w { z30.s }, p1, [x26]\n"
- "add x26, x26, x24, LSL #2\n"
- ".inst 0xc1371270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z7.h\n"
- "st1w { z31.s }, p1, [x25]\n"
- "add x25, x25, x23, LSL #2\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab32 // bfcvtnt z18.h, p2/M, z25.s\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0xc13411f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z4.h\n"
+ ".inst 0xc1351210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z5.h\n"
"16:" // Main loop skip tail
"cbz x14, 17f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x11, x11, #0x1\n"
"ld1w { z16.s }, p0/Z, [x13]\n"
- ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
- "add x20, x13, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z2.s }, p0/Z, [x20]\n"
- ".inst 0x648aa850 // bfcvtnt z16.h, p2/M, z2.s\n"
- "mov x12, #0x4\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z10.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa951 // bfcvt z17.h, p2/M, z10.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aa911 // bfcvtnt z17.h, p2/M, z8.s\n"
+ ".inst 0x658aa812 // bfcvt z18.h, p2/M, z0.s\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z19.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
- ".inst 0xc13011f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z0.h\n"
- "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa8f2 // bfcvtnt z18.h, p2/M, z7.s\n"
+ ".inst 0x658aab93 // bfcvt z19.h, p2/M, z28.s\n"
+ ".inst 0xc13a11f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z10.h\n"
+ ".inst 0xc13b11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z11.h\n"
".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
- ".inst 0xc13e11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z14.h\n"
+ ".inst 0xc1391211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z9.h\n"
".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
- ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ ".inst 0xc1adcb08 // fclamp { z8.s-z11.s }, z24.s, z13.s\n"
"st1w { z8.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc1331211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z3.h\n"
- "add x8, x8, #0x1\n"
"st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"st1w { z10.s }, p1, [x26]\n"
"add x26, x26, x24, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"st1w { z11.s }, p1, [x25]\n"
"add x25, x25, x23, LSL #2\n"
"17:" // Tail input: End
@@ -716,10 +716,10 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x11, x11, #0x1\n"
- ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ ".inst 0xc1adcb08 // fclamp { z8.s-z11.s }, z24.s, z13.s\n"
"st1w { z8.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"st1w { z9.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"st1w { z10.s }, p1, [x26]\n"
@@ -729,11 +729,11 @@ void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
"bgt 18b\n"
"19:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x15\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x15, x16\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
index b72042558d..0b6239a5a4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,237 +69,242 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x8\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x4\n"
- "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x22, #0x8\n"
"ptrue p2.b\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x7\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
- "whilelt p8.s, XZR, x6\n"
- "addvl SP, SP, #-30\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z8.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
"ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-30\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x21\n"
+ "whilelt p8.s, XZR, x5\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
- "ldr x21, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z30.s, #0x0\n"
- "cbz x21, 2f\n"
- "ld1w { z30.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x21\n"
- "ld1w { z12.s }, p2/Z, [x20]\n"
+ "ldr x28, [%x[args], %[offsetof_Args_weights]]\n"
+ "fmov z27.s, #0x0\n"
+ "addvl x27, SP, #30\n"
+ "mov x26, #0x8\n"
+ "addvl x27, x27, #-6\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "add x24, x5, x6\n"
+ "mov z19.d, z18.d\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x23, %x[ld_in_row], #0x2\n"
+ "mov x11, #0x0\n"
+ "mov x22, x28\n"
+ "incb x28\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ld1w { z23.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "sub x20, x25, #0x1\n"
+ "ld1w { z30.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "orr x21, x20, %x[ld_in_col], LSL #18\n"
+ "ld1w { z25.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "orr x21, x7, x21, LSL #20\n"
+ "ld1w { z26.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ ".inst 0x658aaaef // bfcvt z15.h, p2/M, z23.s\n"
+ ".inst 0x648aaafb // bfcvtnt z27.h, p2/M, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x22]\n"
+ ".inst 0x658aabc4 // bfcvt z4.h, p2/M, z30.s\n"
+ "mov x20, x28\n"
+ "incb x28\n"
+ ".inst 0x658aab21 // bfcvt z1.h, p2/M, z25.s\n"
+ "ld1w { z22.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "ld1w { z24.s }, p2/Z, [x20]\n"
+ "mov x8, #0x8\n"
+ ".inst 0x658aab4c // bfcvt z12.h, p2/M, z26.s\n"
+ "lsl x21, x21, #0x2\n"
+ "sub x26, x26, x24\n"
+ "st1h { z27.h }, p2, [x27]\n"
+ ".inst 0x648aabcf // bfcvtnt z15.h, p2/M, z30.s\n"
+ "ld1w { z7.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "fmov z11.s, #0x0\n"
- "incb x21\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
+ "fmov z17.s, #0x0\n"
+ ".inst 0x648aab24 // bfcvtnt z4.h, p2/M, z25.s\n"
+ "ld1w { z0.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aab41 // bfcvtnt z1.h, p2/M, z26.s\n"
+ "ld1w { z21.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aa99a // bfcvt z26.h, p2/M, z12.s\n"
- ".inst 0x658aab10 // bfcvt z16.h, p2/M, z24.s\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
"ld1w { z20.s }, p2/Z, [x20]\n"
+ "mov x20, x28\n"
+ "incb x28\n"
+ "st1h { z15.h }, p2, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaadf // bfcvt z31.h, p2/M, z22.s\n"
+ "madd x23, x23, x5, x16\n"
+ "st1h { z4.h }, p2, [x27, #2, MUL VL]\n"
+ ".inst 0x658aa8fc // bfcvt z28.h, p2/M, z7.s\n"
+ ".inst 0x648aaad1 // bfcvtnt z17.h, p2/M, z22.s\n"
+ "st1h { z1.h }, p2, [x27, #3, MUL VL]\n"
+ ".inst 0x658aa81b // bfcvt z27.h, p2/M, z0.s\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "addvl x24, SP, #30\n"
- ".inst 0x648aa98b // bfcvtnt z11.h, p2/M, z12.s\n"
- "ld1w { z25.s }, p2/Z, [x20]\n"
- "mov x20, x21\n"
- ".inst 0x658aa875 // bfcvt z21.h, p2/M, z3.s\n"
- "addvl x24, x24, #-6\n"
- "ld1w { z6.s }, p2/Z, [x20]\n"
- ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ "st1h { z12.h }, p2, [x27, #4, MUL VL]\n"
+ ".inst 0x658aaaaf // bfcvt z15.h, p2/M, z21.s\n"
+ "st1h { z14.h }, p2, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #-6\n"
+ ".inst 0x648aa8ff // bfcvtnt z31.h, p2/M, z7.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "st1h { z17.h }, p2, [x27]\n"
"incb x20, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x24]\n"
- ".inst 0x648aab1a // bfcvtnt z26.h, p2/M, z24.s\n"
- "ld1w { z14.s }, p2/Z, [x20]\n"
+ "fmov z25.s, #0x0\n"
+ ".inst 0x648aa81c // bfcvtnt z28.h, p2/M, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "fmov z11.s, #0x0\n"
- "st1h { z26.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
- "ld1w { z19.s }, p2/Z, [x20]\n"
+ ".inst 0x648aaabb // bfcvtnt z27.h, p2/M, z21.s\n"
+ ".inst 0x658aaa83 // bfcvt z3.h, p2/M, z20.s\n"
+ "ld1w { z4.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aa8c9 // bfcvt z9.h, p2/M, z6.s\n"
- ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
- "incb x21\n"
- "ld1w { z12.s }, p2/Z, [x20]\n"
+ ".inst 0x648aaa8f // bfcvtnt z15.h, p2/M, z20.s\n"
+ "st1h { z31.h }, p2, [x27, #1, MUL VL]\n"
+ ".inst 0x658aa92c // bfcvt z12.h, p2/M, z9.s\n"
+ "ld1w { z0.s }, p2/Z, [x20]\n"
+ "mov x20, x28\n"
+ "st1h { z28.h }, p2, [x27, #2, MUL VL]\n"
+ ".inst 0x658aab57 // bfcvt z23.h, p2/M, z26.s\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ "incb x28\n"
+ "st1h { z27.h }, p2, [x27, #3, MUL VL]\n"
+ ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "st1h { z16.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
- ".inst 0x658aab37 // bfcvt z23.h, p2/M, z25.s\n"
- "ld1w { z5.s }, p2/Z, [x20]\n"
- ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
- "mov x23, x21\n"
- "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aa8cb // bfcvtnt z11.h, p2/M, z6.s\n"
- ".inst 0x658aaa79 // bfcvt z25.h, p2/M, z19.s\n"
- "ld1w { z4.s }, p2/Z, [x23]\n"
- "incb x23, ALL, MUL #5\n"
- "st1h { z27.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aa9c9 // bfcvtnt z9.h, p2/M, z14.s\n"
- ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
- "incb x21\n"
- "st1h { z23.h }, p2, [x24, #5, MUL VL]\n"
- "addvl x24, x24, #-6\n"
- "ld1w { z26.s }, p2/Z, [x23]\n"
- "incb x23, ALL, MUL #5\n"
- "st1h { z11.h }, p2, [x24]\n"
- "fmov z2.s, #0x0\n"
- ".inst 0x648aaa68 // bfcvtnt z8.h, p2/M, z19.s\n"
- "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x23]\n"
- "incb x23, ALL, MUL #5\n"
- ".inst 0x658aa893 // bfcvt z19.h, p2/M, z4.s\n"
- "st1h { z8.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aa999 // bfcvtnt z25.h, p2/M, z12.s\n"
- "ld1w { z7.s }, p2/Z, [x23]\n"
- "incb x23, ALL, MUL #5\n"
- ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
- ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
- "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
- "ld1w { z18.s }, p2/Z, [x23]\n"
- "mov x20, x21\n"
- ".inst 0x648aa882 // bfcvtnt z2.h, p2/M, z4.s\n"
- ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
- "ld1w { z15.s }, p2/Z, [x20]\n"
+ "st1h { z15.h }, p2, [x27, #4, MUL VL]\n"
+ ".inst 0x658aa894 // bfcvt z20.h, p2/M, z4.s\n"
+ "st1h { z3.h }, p2, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #-6\n"
+ ".inst 0x648aab4c // bfcvtnt z12.h, p2/M, z26.s\n"
+ "ld1w { z7.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "st1h { z17.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z11.h }, p2, [x24, #5, MUL VL]\n"
- "addvl x24, x24, #-6\n"
- ".inst 0x648aab53 // bfcvtnt z19.h, p2/M, z26.s\n"
- ".inst 0x658aa8fa // bfcvt z26.h, p2/M, z7.s\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
+ "st1h { z25.h }, p2, [x27]\n"
+ ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
+ "fmov z22.s, #0x0\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x24]\n"
- ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
- "ld1w { z4.s }, p2/Z, [x20]\n"
- "fmov z21.s, #0x0\n"
- "st1h { z19.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aa89e // bfcvtnt z30.h, p2/M, z4.s\n"
+ ".inst 0x658aa810 // bfcvt z16.h, p2/M, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa814 // bfcvtnt z20.h, p2/M, z0.s\n"
+ "st1h { z12.h }, p2, [x27, #1, MUL VL]\n"
+ ".inst 0x658aa964 // bfcvt z4.h, p2/M, z11.s\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "mov x20, x28\n"
+ "st1h { z23.h }, p2, [x27, #2, MUL VL]\n"
+ ".inst 0x658aa8e9 // bfcvt z9.h, p2/M, z7.s\n"
+ ".inst 0x648aa976 // bfcvtnt z22.h, p2/M, z11.s\n"
+ "st1h { z30.h }, p2, [x27, #3, MUL VL]\n"
+ ".inst 0x658aab3c // bfcvt z28.h, p2/M, z25.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aa9ea // bfcvt z10.h, p2/M, z15.s\n"
- "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x648aa8e6 // bfcvtnt z6.h, p2/M, z7.s\n"
- "incb x21\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x27, #4, MUL VL]\n"
+ ".inst 0x658aa861 // bfcvt z1.h, p2/M, z3.s\n"
+ ".inst 0x648aa8e4 // bfcvtnt z4.h, p2/M, z7.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aa973 // bfcvt z19.h, p2/M, z11.s\n"
- "st1h { z6.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
- ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
+ "st1h { z16.h }, p2, [x27, #5, MUL VL]\n"
+ ".inst 0x648aab29 // bfcvtnt z9.h, p2/M, z25.s\n"
"ld1w { z12.s }, p2/Z, [x20]\n"
- "mov x21, x21\n"
- ".inst 0x658aa897 // bfcvt z23.h, p2/M, z4.s\n"
- ".inst 0x648aa9f5 // bfcvtnt z21.h, p2/M, z15.s\n"
- "ld1w { z24.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aa96a // bfcvtnt z10.h, p2/M, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x648aa893 // bfcvtnt z19.h, p2/M, z4.s\n"
- ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
- "ld1w { z2.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa37 // bfcvtnt z23.h, p2/M, z17.s\n"
- "ld1w { z26.s }, p2/Z, [x21]\n"
- "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
- "addvl x24, x24, #-6\n"
- "st1h { z21.h }, p2, [x24]\n"
- ".inst 0x648aa990 // bfcvtnt z16.h, p2/M, z12.s\n"
- "incb x21, ALL, MUL #5\n"
- "fmov z8.s, #0x0\n"
- "st1h { z10.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aab04 // bfcvt z4.h, p2/M, z24.s\n"
- ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
- "sub x20, x25, #0x1\n"
- "st1h { z19.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x658aa871 // bfcvt z17.h, p2/M, z3.s\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
- "orr x23, x20, %x[ld_in_col], LSL #18\n"
- "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
- ".inst 0x658aa857 // bfcvt z23.h, p2/M, z2.s\n"
- "orr x23, x7, x23, LSL #20\n"
- "mov x22, #0x8\n"
- "st1h { z16.h }, p2, [x24, #4, MUL VL]\n"
- ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
- "add x21, x6, x4\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
- "addvl x24, x24, #-6\n"
- "mov z31.d, z30.d\n"
- ".inst 0x648aab08 // bfcvtnt z8.h, p2/M, z24.s\n"
- "st1h { z8.h }, p2, [x24]\n"
- ".inst 0x648aa864 // bfcvtnt z4.h, p2/M, z3.s\n"
- ".inst 0x648aa851 // bfcvtnt z17.h, p2/M, z2.s\n"
- "mov x11, #0x0\n"
- "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x648aab57 // bfcvtnt z23.h, p2/M, z26.s\n"
- ".inst 0x648aab2e // bfcvtnt z14.h, p2/M, z25.s\n"
- "mov x8, #0x8\n"
- "st1h { z17.h }, p2, [x24, #2, MUL VL]\n"
- ".inst 0x658aab26 // bfcvt z6.h, p2/M, z25.s\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x23, x23, #0x2\n"
- "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x6, x16\n"
- "st1h { z14.h }, p2, [x24, #4, MUL VL]\n"
- "st1h { z6.h }, p2, [x24, #5, MUL VL]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x27, x27, #-6\n"
+ ".inst 0x648aa87c // bfcvtnt z28.h, p2/M, z3.s\n"
+ "ld1w { z27.s }, p2/Z, [x20]\n"
+ "st1h { z22.h }, p2, [x27]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aab01 // bfcvtnt z1.h, p2/M, z24.s\n"
+ "fmov z17.s, #0x0\n"
+ "st1h { z4.h }, p2, [x27, #1, MUL VL]\n"
+ ".inst 0x658aa97e // bfcvt z30.h, p2/M, z11.s\n"
+ ".inst 0x658aab0a // bfcvt z10.h, p2/M, z24.s\n"
+ "st1h { z9.h }, p2, [x27, #2, MUL VL]\n"
+ ".inst 0x658aa84f // bfcvt z15.h, p2/M, z2.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "st1h { z28.h }, p2, [x27, #3, MUL VL]\n"
+ ".inst 0x658aa98e // bfcvt z14.h, p2/M, z12.s\n"
+ "st1h { z1.h }, p2, [x27, #4, MUL VL]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ ".inst 0x648aa971 // bfcvtnt z17.h, p2/M, z11.s\n"
+ "st1h { z10.h }, p2, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #-6\n"
+ ".inst 0x648aa85e // bfcvtnt z30.h, p2/M, z2.s\n"
+ ".inst 0x658aab4a // bfcvt z10.h, p2/M, z26.s\n"
+ ".inst 0x648aa98f // bfcvtnt z15.h, p2/M, z12.s\n"
+ ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
+ "st1h { z17.h }, p2, [x27]\n"
+ ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
+ "st1h { z30.h }, p2, [x27, #1, MUL VL]\n"
+ "st1h { z15.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z14.h }, p2, [x27, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x27, #4, MUL VL]\n"
+ "st1h { z10.h }, p2, [x27, #5, MUL VL]\n"
"3:" // Issue prefetches
- "subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xf8b54afc // rprfm pldstrm, x21, [x23]\n"
+ "add x23, x23, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x16, x6, x20, x16\n"
- ".inst 0xc0046bc0 // mova za.d[x11, #0], { z30.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046bc1 // mova za.d[x11, #1], { z30.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
"mov x22, #0x4\n"
- "ldp x14, x13, [x23], #0x10\n"
- ".inst 0xc0046bc2 // mova za.d[x11, #2], { z30.d-z31.d }\n"
- "ldp x5, x10, [x20], #0x10\n"
- ".inst 0xc0046bc3 // mova za.d[x11, #3], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x5, x21, x16\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046bc4 // mova za.d[x11, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldp x2, x10, [x20], #0x10\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
"ldp x9, x28, [x23], #0x10\n"
- ".inst 0xc0046bc5 // mova za.d[x11, #5], { z30.d-z31.d }\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0046bc6 // mova za.d[x11, #6], { z30.d-z31.d }\n"
- ".inst 0xc0046bc7 // mova za.d[x11, #7], { z30.d-z31.d }\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
"cbz x21, 5f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
- ".inst 0xc1bccba4 // fclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bdc900 // fclamp { z0.s-z3.s }, z8.s, z29.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z4.s }, p1, [x14]\n"
- "add x14, x14, x5, LSL #2\n"
- "st1w { z6.s }, p1, [x13]\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x2, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "st1w { z5.s }, p1, [x9]\n"
+ "st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- "st1w { z7.s }, p1, [x28]\n"
+ "st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
- "adds XZR, x6, x4\n"
+ "adds XZR, x5, x6\n"
"bne 12f\n"
"cbz x22, 10f\n"
"cmp x22, #0x1\n"
@@ -312,327 +317,327 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"6:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row], LSL #2\n"
"ld1w { z21.s }, p1/Z, [x16]\n"
- ".inst 0x658aaab2 // bfcvt z18.h, p2/M, z21.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z11.s }, p1/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z17.s }, p1/Z, [x21]\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
- "ld1w { z12.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
- "ld1w { z7.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaab7 // bfcvt z23.h, p2/M, z21.s\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa8f4 // bfcvt z20.h, p2/M, z7.s\n"
- "ld1w { z12.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa978 // bfcvt z24.h, p2/M, z11.s\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa994 // bfcvtnt z20.h, p2/M, z12.s\n"
- ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12d7250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z13.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
+ ".inst 0x658aa9b9 // bfcvt z25.h, p2/M, z13.s\n"
"ld1w { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa8d5 // bfcvt z21.h, p2/M, z6.s\n"
- ".inst 0xc12c7251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z12.h\n"
- ".inst 0xa0412a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12b7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z11.h\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
- ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
- ".inst 0xc12a7271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z10.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12b7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z11.h\n"
- ".inst 0xc12a7291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z10.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
+ ".inst 0xa1422a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0x648aa9d9 // bfcvtnt z25.h, p2/M, z14.s\n"
+ ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0x648aabda // bfcvtnt z26.h, p2/M, z30.s\n"
+ ".inst 0xc12d7310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1257311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc12f7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc1277331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z7.h\n"
"7:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z6.s }, p1/Z, [x16]\n"
- ".inst 0x658aa8d7 // bfcvt z23.h, p2/M, z6.s\n"
+ "ld1w { z31.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1w { z1.s }, p1/Z, [x22]\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z15.s }, p1/Z, [x22]\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9f8 // bfcvt z24.h, p2/M, z15.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
- "ld1w { z1.s }, p1/Z, [x22]\n"
+ ".inst 0x658aabfe // bfcvt z30.h, p2/M, z31.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
- "ld1w { z9.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaabf // bfcvt z31.h, p2/M, z21.s\n"
+ "ld1w { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x648aa9fe // bfcvtnt z30.h, p2/M, z15.s\n"
+ ".inst 0x658aaa80 // bfcvt z0.h, p2/M, z20.s\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
- ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1412aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0x648aaaff // bfcvtnt z31.h, p2/M, z23.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa9e1 // bfcvt z1.h, p2/M, z15.s\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0x648aa940 // bfcvtnt z0.h, p2/M, z10.s\n"
+ ".inst 0xc12e73d0 // bfdot za.s[x11, 0], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc12673d1 // bfdot za.s[x11, 1], { z30.h-z31.h }, z6.h\n"
".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12f72f2 // bfdot za.s[x11, 2], { z23.h-z24.h }, z15.h\n"
- ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12772f3 // bfdot za.s[x11, 3], { z23.h-z24.h }, z7.h\n"
- "ld1w { z16.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa1a // bfcvtnt z26.h, p2/M, z16.s\n"
- ".inst 0xc1297310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
- ".inst 0xc1217311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
- ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12f7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z15.h\n"
- ".inst 0xc1277313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z7.h\n"
- ".inst 0xc12b7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z11.h\n"
- ".inst 0xc1237331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z3.h\n"
- ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1237332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z3.h\n"
- ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
+ ".inst 0x648aa881 // bfcvtnt z1.h, p2/M, z4.s\n"
+ ".inst 0xc12f73d2 // bfdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc12773d3 // bfdot za.s[x11, 3], { z30.h-z31.h }, z7.h\n"
+ ".inst 0xc12b73f0 // bfdot za.s[x11, 0], { z31.h-z0.h }, z11.h\n"
+ ".inst 0xc12373f1 // bfdot za.s[x11, 1], { z31.h-z0.h }, z3.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f73f2 // bfdot za.s[x11, 2], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12e73f3 // bfdot za.s[x11, 3], { z31.h-z0.h }, z14.h\n"
+ ".inst 0xc12d7010 // bfdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xc1257011 // bfdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12a7012 // bfdot za.s[x11, 2], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc1227013 // bfdot za.s[x11, 3], { z0.h-z1.h }, z2.h\n"
"8:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z24.s }, p1/Z, [x16]\n"
- ".inst 0x658aab02 // bfcvt z2.h, p2/M, z24.s\n"
+ "ld1w { z15.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa02 // bfcvtnt z2.h, p2/M, z16.s\n"
"addvl x21, SP, #18\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z31.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z16.s }, p1/Z, [x23]\n"
+ "ld1w { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
+ ".inst 0x658aa9e2 // bfcvt z2.h, p2/M, z15.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabe3 // bfcvt z3.h, p2/M, z31.s\n"
"ld1w { z1.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa824 // bfcvt z4.h, p2/M, z1.s\n"
- "ld1w { z19.s }, p1/Z, [x23]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0x648aaaa2 // bfcvtnt z2.h, p2/M, z21.s\n"
+ ".inst 0x658aa944 // bfcvt z4.h, p2/M, z10.s\n"
+ "ld1w { z13.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
- ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x648aaac3 // bfcvtnt z3.h, p2/M, z22.s\n"
+ "ld1w { z12.s }, p1/Z, [x23]\n"
+ ".inst 0x658aa9a5 // bfcvt z5.h, p2/M, z13.s\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0x648aa824 // bfcvtnt z4.h, p2/M, z1.s\n"
".inst 0xc12f7050 // bfdot za.s[x11, 0], { z2.h-z3.h }, z15.h\n"
- "ld1w { z0.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa805 // bfcvt z5.h, p2/M, z0.s\n"
- ".inst 0xc1277051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z7.h\n"
- ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12f7052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z15.h\n"
- ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1277053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z7.h\n"
- "ld1w { z10.s }, p1/Z, [x23]\n"
- ".inst 0x648aa945 // bfcvtnt z5.h, p2/M, z10.s\n"
- ".inst 0xc12e7070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z14.h\n"
- ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12e7051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z14.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x648aa985 // bfcvtnt z5.h, p2/M, z12.s\n"
+ ".inst 0xc1297052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1217053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1277070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z7.h\n"
".inst 0xc1267071 // bfdot za.s[x11, 1], { z3.h-z4.h }, z6.h\n"
- ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12f7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z15.h\n"
- ".inst 0xa1422ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1277055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z7.h\n"
- ".inst 0xc12d7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z13.h\n"
- ".inst 0xc12c7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z12.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12d7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z13.h\n"
+ ".inst 0xc12c7055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z12.h\n"
+ ".inst 0xc12f7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z15.h\n"
+ ".inst 0xc12e7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z14.h\n"
".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1287090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z8.h\n"
- ".inst 0xc1207091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z0.h\n"
- ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b7090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc12a7091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
".inst 0xc12f7074 // bfdot za.s[x11, 4], { z3.h-z4.h }, z15.h\n"
".inst 0xc12e7075 // bfdot za.s[x11, 5], { z3.h-z4.h }, z14.h\n"
- ".inst 0xc1277092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z7.h\n"
- ".inst 0xc1267093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z6.h\n"
- ".inst 0xa1422a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1287094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z8.h\n"
- ".inst 0xc1207095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z0.h\n"
+ ".inst 0xc12b7092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc12a7093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12e7094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z14.h\n"
+ ".inst 0xc1267095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z6.h\n"
"9:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z18.s }, p1/Z, [x16]\n"
- ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
+ "ld1w { z7.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1w { z7.s }, p1/Z, [x24]\n"
+ "ld1w { z27.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa8ec // bfcvtnt z12.h, p2/M, z7.s\n"
"addvl x22, SP, #12\n"
- "ld1w { z20.s }, p1/Z, [x24]\n"
+ "ld1w { z26.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa8d // bfcvt z13.h, p2/M, z20.s\n"
"addvl x21, SP, #18\n"
- "ld1w { z0.s }, p1/Z, [x24]\n"
+ "ld1w { z23.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa80d // bfcvtnt z13.h, p2/M, z0.s\n"
+ ".inst 0x658aa8ff // bfcvt z31.h, p2/M, z7.s\n"
"addvl x20, SP, #24\n"
- "ld1w { z10.s }, p1/Z, [x24]\n"
+ "ld1w { z7.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x24]\n"
+ ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
+ "ld1w { z24.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa80e // bfcvtnt z14.h, p2/M, z0.s\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1217190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z1.h\n"
- "ld1w { z17.s }, p1/Z, [x24]\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0x648aab7f // bfcvtnt z31.h, p2/M, z27.s\n"
+ ".inst 0x658aa8e1 // bfcvt z1.h, p2/M, z7.s\n"
+ "ld1w { z30.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
- ".inst 0xc1207191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z0.h\n"
- ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12b7192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z11.h\n"
- ".inst 0xa0412ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12a7193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z10.h\n"
- "ld1w { z18.s }, p1/Z, [x24]\n"
- ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
- ".inst 0xc12171b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12071b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12a7194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z10.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc1227195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z2.h\n"
- ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
- ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
- ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12b71d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z11.h\n"
- ".inst 0xc12a71d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z10.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1297196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z9.h\n"
- ".inst 0xc1287197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z8.h\n"
- ".inst 0xc12171b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
- ".inst 0xc12071b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
- ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12a71d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z10.h\n"
- ".inst 0xc12271d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
- ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12b71b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z11.h\n"
- ".inst 0xc12371b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z3.h\n"
- ".inst 0xc12771d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc12671d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z6.h\n"
- ".inst 0xa0422a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12771d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z7.h\n"
- ".inst 0xc12671d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xa0412aec // ld1h { z12.h-z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0x648aaae0 // bfcvtnt z0.h, p2/M, z23.s\n"
+ "ld1w { z9.s }, p1/Z, [x24]\n"
+ ".inst 0x658aabc2 // bfcvt z2.h, p2/M, z30.s\n"
+ ".inst 0xa0422ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x648aab01 // bfcvtnt z1.h, p2/M, z24.s\n"
+ ".inst 0xc12e73f0 // bfdot za.s[x11, 0], { z31.h-z0.h }, z14.h\n"
+ ".inst 0xc12673f1 // bfdot za.s[x11, 1], { z31.h-z0.h }, z6.h\n"
+ ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0x648aa922 // bfcvtnt z2.h, p2/M, z9.s\n"
+ ".inst 0xc12f73f2 // bfdot za.s[x11, 2], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12773f3 // bfdot za.s[x11, 3], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12d7010 // bfdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xc12c7011 // bfdot za.s[x11, 1], { z0.h-z1.h }, z12.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e73f4 // bfdot za.s[x11, 4], { z31.h-z0.h }, z14.h\n"
+ ".inst 0xc12673f5 // bfdot za.s[x11, 5], { z31.h-z0.h }, z6.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7012 // bfdot za.s[x11, 2], { z0.h-z1.h }, z15.h\n"
+ ".inst 0xc1277013 // bfdot za.s[x11, 3], { z0.h-z1.h }, z7.h\n"
+ ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1257030 // bfdot za.s[x11, 0], { z1.h-z2.h }, z5.h\n"
+ ".inst 0xc1247031 // bfdot za.s[x11, 1], { z1.h-z2.h }, z4.h\n"
+ ".inst 0xa0422aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12e73f6 // bfdot za.s[x11, 6], { z31.h-z0.h }, z14.h\n"
+ ".inst 0xc12673f7 // bfdot za.s[x11, 7], { z31.h-z0.h }, z6.h\n"
+ ".inst 0xc12d7014 // bfdot za.s[x11, 4], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xc12c7015 // bfdot za.s[x11, 5], { z0.h-z1.h }, z12.h\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12b7032 // bfdot za.s[x11, 2], { z1.h-z2.h }, z11.h\n"
+ ".inst 0xc12a7033 // bfdot za.s[x11, 3], { z1.h-z2.h }, z10.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc1277016 // bfdot za.s[x11, 6], { z0.h-z1.h }, z7.h\n"
+ ".inst 0xc1267017 // bfdot za.s[x11, 7], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xc12f7034 // bfdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc12e7035 // bfdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12e7036 // bfdot za.s[x11, 6], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc1267037 // bfdot za.s[x11, 7], { z1.h-z2.h }, z6.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 20f\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z1.s }, p1/Z, [x16]\n"
- ".inst 0x658aa834 // bfcvt z20.h, p2/M, z1.s\n"
+ "ld1w { z22.s }, p1/Z, [x16]\n"
"sub x25, x25, #0x1\n"
- "ld1w { z10.s }, p1/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
"sub x15, x15, #0x1\n"
- ".inst 0x648aa954 // bfcvtnt z20.h, p2/M, z10.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"cmp x25, x15\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaade // bfcvt z30.h, p2/M, z22.s\n"
"csel x25, x25, x15, LT\n"
- ".inst 0x648aaa75 // bfcvtnt z21.h, p2/M, z19.s\n"
- "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
"sub x15, x15, x25\n"
- "ld1w { z15.s }, p1/Z, [x20]\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9f7 // bfcvt z23.h, p2/M, z15.s\n"
- "ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ ".inst 0x658aab80 // bfcvt z0.h, p2/M, z28.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa1e // bfcvtnt z30.h, p2/M, z16.s\n"
+ ".inst 0x658aaa21 // bfcvt z1.h, p2/M, z17.s\n"
+ ".inst 0x648aa8ff // bfcvtnt z31.h, p2/M, z7.s\n"
+ ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
+ ".inst 0x648aa981 // bfcvtnt z1.h, p2/M, z12.s\n"
"cbz x25, 19f\n"
"11:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc12b73d0 // bfdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
"addvl x23, SP, #12\n"
- "ld1w { z27.s }, p1/Z, [x16]\n"
- ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ "ld1w { z25.s }, p1/Z, [x16]\n"
+ ".inst 0xc12373d1 // bfdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc1297292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z9.h\n"
"add x20, x16, %x[ld_in_row], LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1217293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z1.h\n"
- ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
- "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xc12f73d2 // bfdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12e73d3 // bfdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12573f0 // bfdot za.s[x11, 0], { z31.h-z0.h }, z5.h\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xc12473f1 // bfdot za.s[x11, 1], { z31.h-z0.h }, z4.h\n"
".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
- "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc12e73d4 // bfdot za.s[x11, 4], { z30.h-z31.h }, z14.h\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xc12673d5 // bfdot za.s[x11, 5], { z30.h-z31.h }, z6.h\n"
".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
- "ld1w { z19.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
- ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
- "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc12f73f2 // bfdot za.s[x11, 2], { z31.h-z0.h }, z15.h\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
- ".inst 0xa1422b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc12773f3 // bfdot za.s[x11, 3], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7010 // bfdot za.s[x11, 0], { z0.h-z1.h }, z10.h\n"
"ld1w { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
- ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12e72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc1227011 // bfdot za.s[x11, 1], { z0.h-z1.h }, z2.h\n"
+ ".inst 0xa1422b06 // ld1h { z6.h, z14.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12d73d6 // bfdot za.s[x11, 6], { z30.h-z31.h }, z13.h\n"
"ld1w { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc12672b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
- ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12f72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z15.h\n"
- ".inst 0xc12772d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z7.h\n"
- ".inst 0xa1422ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12e72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z14.h\n"
- ".inst 0xc12672b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z6.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12f72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z15.h\n"
- ".inst 0xc12772d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z7.h\n"
- ".inst 0xa0422ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12f72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z15.h\n"
- ".inst 0xc12e72d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z14.h\n"
- ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12c1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z12.h\n"
- ".inst 0xc1241291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z4.h\n"
- ".inst 0x658aab74 // bfcvt z20.h, p2/M, z27.s\n"
- ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc12d12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z13.h\n"
- ".inst 0x648aab54 // bfcvtnt z20.h, p2/M, z26.s\n"
- ".inst 0xc12512b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z5.h\n"
- ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc12912d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z9.h\n"
- ".inst 0x648aab15 // bfcvtnt z21.h, p2/M, z24.s\n"
- ".inst 0xc12112d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z1.h\n"
- ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
- ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ ".inst 0xc12573d7 // bfdot za.s[x11, 7], { z30.h-z31.h }, z5.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f73f4 // bfdot za.s[x11, 4], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12773f5 // bfdot za.s[x11, 5], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e7012 // bfdot za.s[x11, 2], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc1267013 // bfdot za.s[x11, 3], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f73f6 // bfdot za.s[x11, 6], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12773f7 // bfdot za.s[x11, 7], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7014 // bfdot za.s[x11, 4], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc1267015 // bfdot za.s[x11, 5], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa1422ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12b7016 // bfdot za.s[x11, 6], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc1237017 // bfdot za.s[x11, 7], { z0.h-z1.h }, z3.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12d13d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z13.h\n"
+ ".inst 0xc12513d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z5.h\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12713f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z7.h\n"
+ ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
+ ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
+ ".inst 0xc12a1010 // bfdot za.s[x8, 0], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc1221011 // bfdot za.s[x8, 1], { z0.h-z1.h }, z2.h\n"
+ ".inst 0x658aaaa0 // bfcvt z0.h, p2/M, z21.s\n"
+ ".inst 0x658aaa21 // bfcvt z1.h, p2/M, z17.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1bccba8 // fclamp { z8.s-z11.s }, z29.s, z28.s\n"
- "st1w { z8.s }, p1, [x14]\n"
- "add x14, x14, x5, LSL #2\n"
- "st1w { z10.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
+ ".inst 0xa1422be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aaadf // bfcvtnt z31.h, p2/M, z22.s\n"
+ ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0x648aaa80 // bfcvtnt z0.h, p2/M, z20.s\n"
+ ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
+ ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
- "st1w { z9.s }, p1, [x9]\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1bdc90c // fclamp { z12.s-z15.s }, z8.s, z29.s\n"
+ "st1w { z12.s }, p1, [x14]\n"
+ "add x14, x14, x2, LSL #2\n"
+ "st1w { z14.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "st1w { z13.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
- ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
- "st1w { z11.s }, p1, [x28]\n"
+ "st1w { z15.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 11b\n"
"b 19f\n"
@@ -647,450 +652,450 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa06 // bfcvt z6.h, p2/M, z16.s\n"
"add x21, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x16]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa1422a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa06 // bfcvtnt z6.h, p2/M, z16.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x658aaa07 // bfcvt z7.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa07 // bfcvtnt z7.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa08 // bfcvt z8.h, p2/M, z16.s\n"
+ ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x20, SP, #24\n"
- "ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12f70d0 // bfdot za.s[x11, 0], { z6.h-z7.h }, z15.h\n"
- "ld1w { z9.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2b // bfcvt z11.h, p2/M, z17.s\n"
+ ".inst 0xc12d7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
- ".inst 0xc12e70d1 // bfdot za.s[x11, 1], { z6.h-z7.h }, z14.h\n"
- ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1257131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x658aaa2c // bfcvt z12.h, p2/M, z17.s\n"
"ld1w { z16.s }, p0/Z, [x21]\n"
- ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
- ".inst 0xc12f70f0 // bfdot za.s[x11, 0], { z7.h-z8.h }, z15.h\n"
- ".inst 0xc12e70f1 // bfdot za.s[x11, 1], { z7.h-z8.h }, z14.h\n"
- ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1237110 // bfdot za.s[x11, 0], { z8.h-z9.h }, z3.h\n"
- ".inst 0xc1227111 // bfdot za.s[x11, 1], { z8.h-z9.h }, z2.h\n"
+ ".inst 0xc12e7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc1267151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xc12f7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z15.h\n"
+ ".inst 0xc1277171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z7.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "add x22, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x21, SP, #18\n"
+ ".inst 0x658aaa19 // bfcvt z25.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa39 // bfcvtnt z25.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
- "ld1w { z16.s }, p0/Z, [x22]\n"
- "addvl x20, SP, #24\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
+ ".inst 0xc12e7310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z14.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
- ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1267311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z6.h\n"
+ ".inst 0x648aaa3a // bfcvtnt z26.h, p2/M, z17.s\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x658aaa1b // bfcvt z27.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
- ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc12c7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1247313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xc12f7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0x648aaa1b // bfcvtnt z27.h, p2/M, z16.s\n"
+ ".inst 0xc1277331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z7.h\n"
".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc12e7153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z14.h\n"
- ".inst 0xc12d7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z13.h\n"
- ".inst 0xc1257171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z5.h\n"
+ ".inst 0xc12f7332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc12e7333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1217350 // bfdot za.s[x11, 0], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc1207351 // bfdot za.s[x11, 1], { z26.h-z27.h }, z0.h\n"
".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12f7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z15.h\n"
- ".inst 0xc12e7173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc12f7352 // bfdot za.s[x11, 2], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc12e7353 // bfdot za.s[x11, 3], { z26.h-z27.h }, z14.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
- "add x23, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
- "add x23, x23, %x[ld_in_row], LSL #2\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x23]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaa19 // bfcvt z25.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa39 // bfcvtnt z25.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x23]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x22, SP, #12\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa3a // bfcvtnt z26.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x23]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc1297250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
- "ld1w { z26.s }, p0/Z, [x23]\n"
- "addvl x21, SP, #18\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab55 // bfcvt z21.h, p2/M, z26.s\n"
+ ".inst 0x658aaa1b // bfcvt z27.h, p2/M, z16.s\n"
+ ".inst 0xc12e7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z14.h\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1217251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12e7252 // bfdot za.s[x11, 2], { z18.h-z19.h }, z14.h\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1267331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z6.h\n"
+ ".inst 0x648aaa3b // bfcvtnt z27.h, p2/M, z17.s\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x658aaa1c // bfcvt z28.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x23]\n"
- ".inst 0xc1267253 // bfdot za.s[x11, 3], { z18.h-z19.h }, z6.h\n"
- ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12b7332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z11.h\n"
+ ".inst 0xc12a7333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z10.h\n"
".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12f7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z15.h\n"
- ".inst 0xc1277271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z7.h\n"
- ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa1422ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12d7254 // bfdot za.s[x11, 4], { z18.h-z19.h }, z13.h\n"
- ".inst 0xc1257255 // bfdot za.s[x11, 5], { z18.h-z19.h }, z5.h\n"
- ".inst 0xc12e7272 // bfdot za.s[x11, 2], { z19.h-z20.h }, z14.h\n"
- ".inst 0xc1267273 // bfdot za.s[x11, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xc12f7350 // bfdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0x648aaa1c // bfcvtnt z28.h, p2/M, z16.s\n"
+ ".inst 0xc1277351 // bfdot za.s[x11, 1], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12d7334 // bfdot za.s[x11, 4], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1257335 // bfdot za.s[x11, 5], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xc12f7352 // bfdot za.s[x11, 2], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc12e7353 // bfdot za.s[x11, 3], { z26.h-z27.h }, z14.h\n"
".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12f7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z15.h\n"
- ".inst 0xc1277291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xc12c7370 // bfdot za.s[x11, 0], { z27.h-z28.h }, z12.h\n"
+ ".inst 0xc1247371 // bfdot za.s[x11, 1], { z27.h-z28.h }, z4.h\n"
".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12d7274 // bfdot za.s[x11, 4], { z19.h-z20.h }, z13.h\n"
- ".inst 0xc1257275 // bfdot za.s[x11, 5], { z19.h-z20.h }, z5.h\n"
- ".inst 0xc12f7292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z15.h\n"
- ".inst 0xc12e7293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z14.h\n"
- ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1237294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z3.h\n"
- ".inst 0xc1227295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xc12d7354 // bfdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1257355 // bfdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc12f7372 // bfdot za.s[x11, 2], { z27.h-z28.h }, z15.h\n"
+ ".inst 0xc12e7373 // bfdot za.s[x11, 3], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xa0422a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1257374 // bfdot za.s[x11, 4], { z27.h-z28.h }, z5.h\n"
+ ".inst 0xc1247375 // bfdot za.s[x11, 5], { z27.h-z28.h }, z4.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "addvl x20, SP, #24\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
- "add x24, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
- "add x24, x24, %x[ld_in_row], LSL #2\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0422ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaa19 // bfcvt z25.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa39 // bfcvtnt z25.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x24]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "addvl x23, SP, #6\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa3a // bfcvtnt z26.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa1402ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
- "ld1w { z16.s }, p0/Z, [x24]\n"
- "addvl x22, SP, #12\n"
"add x24, x24, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x658aaa1b // bfcvt z27.h, p2/M, z16.s\n"
+ ".inst 0xc12e7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z14.h\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #18\n"
- ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
- "addvl x20, SP, #24\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1267331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z6.h\n"
+ ".inst 0x648aaa3b // bfcvtnt z27.h, p2/M, z17.s\n"
+ ".inst 0xa1402ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22]\n"
+ ".inst 0x658aaa1c // bfcvt z28.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x24]\n"
- ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
- ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
- ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc12a7332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z10.h\n"
+ ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7350 // bfdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0x648aaa1c // bfcvtnt z28.h, p2/M, z16.s\n"
+ ".inst 0xc1277351 // bfdot za.s[x11, 1], { z26.h-z27.h }, z7.h\n"
".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12d7134 // bfdot za.s[x11, 4], { z9.h-z10.h }, z13.h\n"
- ".inst 0xc1257135 // bfdot za.s[x11, 5], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xc12e7334 // bfdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1267335 // bfdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc1277153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc12f7352 // bfdot za.s[x11, 2], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc1277353 // bfdot za.s[x11, 3], { z26.h-z27.h }, z7.h\n"
".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12e7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z14.h\n"
- ".inst 0xc1267171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xc1217370 // bfdot za.s[x11, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1207371 // bfdot za.s[x11, 1], { z27.h-z28.h }, z0.h\n"
".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12d7136 // bfdot za.s[x11, 6], { z9.h-z10.h }, z13.h\n"
- ".inst 0xc1257137 // bfdot za.s[x11, 7], { z9.h-z10.h }, z5.h\n"
- ".inst 0xc12f7154 // bfdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc1277155 // bfdot za.s[x11, 5], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc12d7336 // bfdot za.s[x11, 6], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1257337 // bfdot za.s[x11, 7], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xc12f7354 // bfdot za.s[x11, 4], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc1277355 // bfdot za.s[x11, 5], { z26.h-z27.h }, z7.h\n"
".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12e7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z14.h\n"
- ".inst 0xc1267173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xc12e7372 // bfdot za.s[x11, 2], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc1267373 // bfdot za.s[x11, 3], { z27.h-z28.h }, z6.h\n"
".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12f7156 // bfdot za.s[x11, 6], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc1277157 // bfdot za.s[x11, 7], { z10.h-z11.h }, z7.h\n"
- ".inst 0xc1297174 // bfdot za.s[x11, 4], { z11.h-z12.h }, z9.h\n"
- ".inst 0xc1217175 // bfdot za.s[x11, 5], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xc12f7356 // bfdot za.s[x11, 6], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc1277357 // bfdot za.s[x11, 7], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc1297374 // bfdot za.s[x11, 4], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc1217375 // bfdot za.s[x11, 5], { z27.h-z28.h }, z1.h\n"
".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1217176 // bfdot za.s[x11, 6], { z11.h-z12.h }, z1.h\n"
- ".inst 0xc1207177 // bfdot za.s[x11, 7], { z11.h-z12.h }, z0.h\n"
+ ".inst 0xc1217376 // bfdot za.s[x11, 6], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1207377 // bfdot za.s[x11, 7], { z27.h-z28.h }, z0.h\n"
"17:" // Padded: 0 priming loads
- ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 20f\n"
"mov x12, #0x0\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
"ld1w { z16.s }, p0/Z, [x16]\n"
- ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
- "add x20, x16, %x[ld_in_row], LSL #2\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "sub x15, x15, x25\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aaa1e // bfcvt z30.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
- "mov x12, #0x4\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa3e // bfcvtnt z30.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0x648aaa3f // bfcvtnt z31.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa20 // bfcvtnt z0.h, p2/M, z17.s\n"
+ ".inst 0x658aaa01 // bfcvt z1.h, p2/M, z16.s\n"
"ld1w { z16.s }, p0/Z, [x20]\n"
- "sub x25, x25, #0x1\n"
- ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
- "sub x15, x15, #0x1\n"
- "cmp x25, x15\n"
- "csel x25, x25, x15, LT\n"
- "add x16, x16, %x[ld_in_col], LSL #2\n"
- "sub x15, x15, x25\n"
+ ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
"cbz x25, 19f\n"
"18:" // Padded: Main loop
- "addvl x24, SP, #6\n"
- ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
- "addvl x23, SP, #12\n"
- ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa0402b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12b73d0 // bfdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1237292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z3.h\n"
- "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0xc12373d1 // bfdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa1402b03 // ld1h { z3.h, z11.h }, pn10.b/Z, [x24]\n"
+ "addvl x23, SP, #12\n"
"add x22, x16, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1227293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z2.h\n"
- ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
- "ld1w { z19.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
- ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
"subs x25, x25, #0x1\n"
+ "ld1w { z22.s }, p0/Z, [x16]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc12b73d2 // bfdot za.s[x11, 2], { z30.h-z31.h }, z11.h\n"
"add x16, x16, %x[ld_in_col], LSL #2\n"
- ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
- "ld1w { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc12373d3 // bfdot za.s[x11, 3], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12573f0 // bfdot za.s[x11, 0], { z31.h-z0.h }, z5.h\n"
+ ".inst 0xc12473f1 // bfdot za.s[x11, 1], { z31.h-z0.h }, z4.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc12e73d4 // bfdot za.s[x11, 4], { z30.h-z31.h }, z14.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12673d5 // bfdot za.s[x11, 5], { z30.h-z31.h }, z6.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f73f2 // bfdot za.s[x11, 2], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12773f3 // bfdot za.s[x11, 3], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7010 // bfdot za.s[x11, 0], { z0.h-z1.h }, z10.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
- "ld1w { z27.s }, p0/Z, [x22]\n"
+ ".inst 0xc1227011 // bfdot za.s[x11, 1], { z0.h-z1.h }, z2.h\n"
+ ".inst 0xa1422b06 // ld1h { z6.h, z14.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12d73d6 // bfdot za.s[x11, 6], { z30.h-z31.h }, z13.h\n"
+ ".inst 0xc12573d7 // bfdot za.s[x11, 7], { z30.h-z31.h }, z5.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f73f4 // bfdot za.s[x11, 4], { z31.h-z0.h }, z15.h\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
- ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
- "ld1w { z10.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12773f5 // bfdot za.s[x11, 5], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7012 // bfdot za.s[x11, 2], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc1267013 // bfdot za.s[x11, 3], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f73f6 // bfdot za.s[x11, 6], { z31.h-z0.h }, z15.h\n"
+ "ld1w { z23.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
- ".inst 0xa1422b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc12e7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z14.h\n"
- "ld1w { z8.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12773f7 // bfdot za.s[x11, 7], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e7014 // bfdot za.s[x11, 4], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc1267015 // bfdot za.s[x11, 5], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa0422aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1267297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
- "ld1w { z11.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1237016 // bfdot za.s[x11, 6], { z0.h-z1.h }, z3.h\n"
+ ".inst 0xc1227017 // bfdot za.s[x11, 7], { z0.h-z1.h }, z2.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12c72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z12.h\n"
- "ld1w { z18.s }, p0/Z, [x22]\n"
- ".inst 0xc12472d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
- ".inst 0xa1422ae4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
- ".inst 0xc12e72b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z14.h\n"
- ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12c72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z12.h\n"
- ".inst 0xc12472d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12172d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z1.h\n"
- ".inst 0xc12072d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z0.h\n"
- ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
- ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
- ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
- ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
- ".inst 0x648aaa74 // bfcvtnt z20.h, p2/M, z19.s\n"
- ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
- ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
- ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
- ".inst 0xc12012d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
- ".inst 0x658aa956 // bfcvt z22.h, p2/M, z10.s\n"
- ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc12413d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z4.h\n"
+ ".inst 0x658aaade // bfcvt z30.h, p2/M, z22.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12713f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z7.h\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
+ ".inst 0xa0412be4 // ld1h { z4.h-z5.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0x648aa93e // bfcvtnt z30.h, p2/M, z9.s\n"
+ ".inst 0xc12a1010 // bfdot za.s[x8, 0], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc1221011 // bfdot za.s[x8, 1], { z0.h-z1.h }, z2.h\n"
+ ".inst 0x658aaae0 // bfcvt z0.h, p2/M, z23.s\n"
+ ".inst 0x658aaa21 // bfcvt z1.h, p2/M, z17.s\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
- ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- ".inst 0x648aa916 // bfcvtnt z22.h, p2/M, z8.s\n"
- ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
- "st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x5, LSL #2\n"
- "st1w { z2.s }, p1, [x13]\n"
- "add x13, x13, x10, LSL #2\n"
+ ".inst 0xa1422be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aab1f // bfcvtnt z31.h, p2/M, z24.s\n"
+ ".inst 0xc006680c // mova { z12.d-z13.d }, za.d[x11, #0]\n"
+ ".inst 0x648aaa80 // bfcvtnt z0.h, p2/M, z20.s\n"
+ ".inst 0x648aaa01 // bfcvtnt z1.h, p2/M, z16.s\n"
+ ".inst 0xc006682e // mova { z14.d-z15.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
- "st1w { z1.s }, p1, [x9]\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1bdc90c // fclamp { z12.s-z15.s }, z8.s, z29.s\n"
+ "st1w { z12.s }, p1, [x14]\n"
+ "add x14, x14, x2, LSL #2\n"
+ "st1w { z14.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "st1w { z13.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
- ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
- "st1w { z3.s }, p1, [x28]\n"
+ "st1w { z15.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 18b\n"
"19:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc12b73d0 // bfdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12373d1 // bfdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc1217292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
- ".inst 0xc1207293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
- ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
- ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
- ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xc12b73d2 // bfdot za.s[x11, 2], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc12373d3 // bfdot za.s[x11, 3], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0402acc // ld1h { z12.h-z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12573f0 // bfdot za.s[x11, 0], { z31.h-z0.h }, z5.h\n"
+ ".inst 0xc12473f1 // bfdot za.s[x11, 1], { z31.h-z0.h }, z4.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12d73d4 // bfdot za.s[x11, 4], { z30.h-z31.h }, z13.h\n"
+ ".inst 0xc12c73d5 // bfdot za.s[x11, 5], { z30.h-z31.h }, z12.h\n"
".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
- ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12e73f2 // bfdot za.s[x11, 2], { z31.h-z0.h }, z14.h\n"
+ ".inst 0xc12673f3 // bfdot za.s[x11, 3], { z31.h-z0.h }, z6.h\n"
".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
- ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xc12a7010 // bfdot za.s[x11, 0], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc1227011 // bfdot za.s[x11, 1], { z0.h-z1.h }, z2.h\n"
".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
- ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
- ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12d73d6 // bfdot za.s[x11, 6], { z30.h-z31.h }, z13.h\n"
+ ".inst 0xc12573d7 // bfdot za.s[x11, 7], { z30.h-z31.h }, z5.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f73f4 // bfdot za.s[x11, 4], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12773f5 // bfdot za.s[x11, 5], { z31.h-z0.h }, z7.h\n"
".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc12e72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
- ".inst 0xc12672d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc12e7012 // bfdot za.s[x11, 2], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc1267013 // bfdot za.s[x11, 3], { z0.h-z1.h }, z6.h\n"
".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
- ".inst 0xc12772b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12f73f6 // bfdot za.s[x11, 6], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12773f7 // bfdot za.s[x11, 7], { z31.h-z0.h }, z7.h\n"
".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc12e72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
- ".inst 0xc12672d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc12e7014 // bfdot za.s[x11, 4], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc1267015 // bfdot za.s[x11, 5], { z0.h-z1.h }, z6.h\n"
".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc12b72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z11.h\n"
- ".inst 0xc12372d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xc12b7016 // bfdot za.s[x11, 6], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc1237017 // bfdot za.s[x11, 7], { z0.h-z1.h }, z3.h\n"
".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
- ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
- ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
- ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
- ".inst 0xc12312d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z3.h\n"
- ".inst 0xc12212d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z2.h\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
+ ".inst 0xc12413d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z4.h\n"
+ ".inst 0xc12f13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z15.h\n"
+ ".inst 0xc12713f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z7.h\n"
+ ".inst 0xc1231010 // bfdot za.s[x8, 0], { z0.h-z1.h }, z3.h\n"
+ ".inst 0xc1221011 // bfdot za.s[x8, 1], { z0.h-z1.h }, z2.h\n"
"add x8, x8, #0x2\n"
".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
- ".inst 0xc1bccbb4 // fclamp { z20.s-z23.s }, z29.s, z28.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1bdc914 // fclamp { z20.s-z23.s }, z8.s, z29.s\n"
"st1w { z20.s }, p1, [x14]\n"
- "add x14, x14, x5, LSL #2\n"
+ "add x14, x14, x2, LSL #2\n"
"st1w { z22.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "add x11, x11, #0x2\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"st1w { z21.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"st1w { z23.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"20:" // Main loop skip tail
@@ -1100,27 +1105,27 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
- ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1bdc900 // fclamp { z0.s-z3.s }, z8.s, z29.s\n"
"st1w { z0.s }, p1, [x14]\n"
- "add x14, x14, x5, LSL #2\n"
+ "add x14, x14, x2, LSL #2\n"
"st1w { z2.s }, p1, [x13]\n"
"add x13, x13, x10, LSL #2\n"
- "add x11, x11, #0x2\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
"st1w { z1.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
"st1w { z3.s }, p1, [x28]\n"
"add x28, x28, x26, LSL #2\n"
"bgt 21b\n"
"22:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1139,9 +1144,11 @@ void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
index 3a56e69d26..936e963915 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,162 +70,167 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
"ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "mov x20, #0xb\n"
+ "mov x22, SP\n"
+ "mov x21, #0xb\n"
".inst 0xd503477f // SMSTART ZA\n"
- "sub x20, x20, x3\n"
"ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x22, #0x8\n"
"ptrue p2.b\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
- "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
"ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z30.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "sub x21, x21, x3\n"
+ "mov SP, x20\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z31.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-15\n"
"whilelt p1.s, XZR, x5\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p9.s, XZR, x21\n"
"whilelt p8.s, XZR, x4\n"
- "addvl SP, SP, #-15\n"
- "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
"1:" // Channel loop
"ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
- "fmov z16.s, #0x0\n"
+ "fmov z24.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z16.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x6, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x21\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
+ "ldr x27, [%x[args], %[offsetof_Args_weights]]\n"
+ "addvl x26, SP, #15\n"
+ "mov x25, #0xb\n"
+ "mov z25.d, z24.d\n"
+ "addvl x26, x26, #-3\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "add x24, x4, x3\n"
+ "mov z26.d, z24.d\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x23, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z24.d\n"
+ "mov x8, #0x0\n"
+ "mov x22, x27\n"
+ "incb x27\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "ld1w { z15.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "sub x20, x7, #0x1\n"
+ "ld1w { z23.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "orr x21, x20, %x[ld_in_col], LSL #18\n"
+ "ld1w { z14.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "orr x21, x5, x21, LSL #20\n"
+ ".inst 0x658aa9fd // bfcvt z29.h, p2/M, z15.s\n"
+ "ld1w { z15.s }, p2/Z, [x22]\n"
+ "incb x22, ALL, MUL #5\n"
+ "lsl x21, x21, #0x2\n"
+ "ld1w { z0.s }, p2/Z, [x22]\n"
+ "mov x20, x27\n"
+ "incb x27\n"
+ ".inst 0x658aa9ce // bfcvt z14.h, p2/M, z14.s\n"
+ "ld1w { z22.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "ld1w { z8.s }, p2/Z, [x20]\n"
+ "sub x25, x25, x24\n"
+ "madd x23, x23, x4, x17\n"
+ ".inst 0x648aaafd // bfcvtnt z29.h, p2/M, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
- "incb x21\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "incb x20, ALL, MUL #5\n"
- ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
- "addvl x24, SP, #15\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
"ld1w { z17.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x648aa90f // bfcvtnt z15.h, p2/M, z8.s\n"
- "addvl x24, x24, #-3\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
- "mov x20, x21\n"
- "st1h { z15.h }, p2, [x24]\n"
- ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
- "ld1w { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x648aa9ee // bfcvtnt z14.h, p2/M, z15.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aabb5 // bfcvt z21.h, p2/M, z29.s\n"
- "incb x21\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x20, x27\n"
+ "incb x27\n"
+ "st1h { z29.h }, p2, [x26]\n"
+ ".inst 0x658aaad3 // bfcvt z19.h, p2/M, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "st1h { z14.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aaa58 // bfcvt z24.h, p2/M, z18.s\n"
- "ld1w { z26.s }, p2/Z, [x20]\n"
+ "st1h { z14.h }, p2, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaa21 // bfcvt z1.h, p2/M, z17.s\n"
+ "st1h { z28.h }, p2, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #-3\n"
+ ".inst 0x658aa965 // bfcvt z5.h, p2/M, z11.s\n"
+ ".inst 0x648aaa93 // bfcvtnt z19.h, p2/M, z20.s\n"
+ "ld1w { z13.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aab41 // bfcvt z1.h, p2/M, z26.s\n"
- ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ "ld1w { z0.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa841 // bfcvtnt z1.h, p2/M, z2.s\n"
"ld1w { z17.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "st1h { z24.h }, p2, [x24, #2, MUL VL]\n"
- "addvl x24, x24, #-3\n"
- "ld1w { z9.s }, p2/Z, [x20]\n"
- "mov x20, x21\n"
- "st1h { z21.h }, p2, [x24]\n"
- ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
- "ld1w { z3.s }, p2/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ "mov x20, x27\n"
+ "incb x27\n"
+ "st1h { z19.h }, p2, [x26]\n"
+ ".inst 0x658aaad3 // bfcvt z19.h, p2/M, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "incb x21\n"
- ".inst 0x658aa864 // bfcvt z4.h, p2/M, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
+ "st1h { z1.h }, p2, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa814 // bfcvt z20.h, p2/M, z0.s\n"
+ "st1h { z5.h }, p2, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #-3\n"
+ ".inst 0x658aabaf // bfcvt z15.h, p2/M, z29.s\n"
+ ".inst 0x648aa9b3 // bfcvtnt z19.h, p2/M, z13.s\n"
+ "ld1w { z7.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aa92b // bfcvt z11.h, p2/M, z9.s\n"
- "st1h { z1.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aaa46 // bfcvt z6.h, p2/M, z18.s\n"
- "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x20]\n"
+ ".inst 0x648aaa34 // bfcvtnt z20.h, p2/M, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "addvl x24, x24, #-3\n"
- ".inst 0x648aabe4 // bfcvtnt z4.h, p2/M, z31.s\n"
- "ld1w { z27.s }, p2/Z, [x20]\n"
- "mov x20, x21\n"
- "st1h { z4.h }, p2, [x24]\n"
- ".inst 0x648aa8a6 // bfcvtnt z6.h, p2/M, z5.s\n"
- "ld1w { z9.s }, p2/Z, [x20]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x20, x27\n"
+ "st1h { z19.h }, p2, [x26]\n"
+ ".inst 0x658aaa5c // bfcvt z28.h, p2/M, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aa938 // bfcvt z24.h, p2/M, z9.s\n"
- "incb x21\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
+ "st1h { z20.h }, p2, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa9b6 // bfcvt z22.h, p2/M, z13.s\n"
+ "st1h { z15.h }, p2, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #-3\n"
+ ".inst 0x658aa965 // bfcvt z5.h, p2/M, z11.s\n"
+ ".inst 0x648aa8fc // bfcvtnt z28.h, p2/M, z7.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
- "st1h { z6.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z31.s }, p2/Z, [x20]\n"
+ "ld1w { z13.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
- ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
- "ld1w { z18.s }, p2/Z, [x20]\n"
+ ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
"incb x20, ALL, MUL #5\n"
- "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
- "st1h { z21.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x20]\n"
- "mov x21, x21\n"
- "addvl x24, x24, #-3\n"
- "st1h { z24.h }, p2, [x24]\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x648aaa59 // bfcvtnt z25.h, p2/M, z18.s\n"
- "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
- ".inst 0x658aa976 // bfcvt z22.h, p2/M, z11.s\n"
- "ld1w { z28.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- ".inst 0x658aab85 // bfcvt z5.h, p2/M, z28.s\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "ld1w { z25.s }, p2/Z, [x21]\n"
- "incb x21, ALL, MUL #5\n"
- "sub x20, x7, #0x1\n"
- "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
- "ld1w { z11.s }, p2/Z, [x21]\n"
- "orr x23, x20, %x[ld_in_col], LSL #18\n"
- "addvl x24, x24, #-3\n"
- "mov z17.d, z16.d\n"
- "orr x23, x5, x23, LSL #20\n"
- "mov x22, #0xb\n"
- "mov z18.d, z16.d\n"
- "mov z19.d, z16.d\n"
- "add x21, x4, x3\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- ".inst 0x648aa909 // bfcvtnt z9.h, p2/M, z8.s\n"
- "st1h { z9.h }, p2, [x24]\n"
- ".inst 0x648aab25 // bfcvtnt z5.h, p2/M, z25.s\n"
- "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
- ".inst 0x658aa97b // bfcvt z27.h, p2/M, z11.s\n"
- "mov x8, #0x0\n"
- "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
- "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
- "lsl x23, x23, #0x2\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x4, x17\n"
+ "ld1w { z0.s }, p2/Z, [x20]\n"
+ "st1h { z28.h }, p2, [x26]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "st1h { z22.h }, p2, [x26, #1, MUL VL]\n"
+ ".inst 0x658aa9b2 // bfcvt z18.h, p2/M, z13.s\n"
+ "st1h { z5.h }, p2, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #-3\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
+ ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
+ "st1h { z16.h }, p2, [x26]\n"
+ "st1h { z18.h }, p2, [x26, #1, MUL VL]\n"
+ "st1h { z28.h }, p2, [x26, #2, MUL VL]\n"
"3:" // Issue prefetches
- "subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xf8b54afc // rprfm pldstrm, x21, [x23]\n"
+ "add x23, x23, %x[ld_in_col], LSL #2\n"
"bgt 3b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x2\n"
- "msub x17, x4, x20, x17\n"
- ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "lsl x21, %x[ld_in_row], #0x2\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
"mov x22, #0x4\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x17, x4, x21, x17\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
"ldp x15, x14, [x23], #0x10\n"
- ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc0040f03 // mova za.d[x8, #3], { z24.d-z27.d }\n"
"ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040e03 // mova za.d[x8, #3], { z16.d-z19.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
"ldp x10, x9, [x23], #0x10\n"
"ldp x28, x27, [x20], #0x10\n"
"cbz x21, 5f\n"
@@ -234,21 +239,21 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 5f\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
"add x21, x21, #0x1\n"
- ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
"lsr x21, x21, #0x1\n"
"sub x16, x16, x21\n"
+ ".inst 0xc1bfcbc0 // fclamp { z0.s-z3.s }, z30.s, z31.s\n"
"4:" // Left padding
"subs x21, x21, #0x1\n"
- "st1w { z4.s }, p1, [x15]\n"
+ "st1w { z0.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- "st1w { z5.s }, p1, [x14]\n"
+ "st1w { z1.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z6.s }, p1, [x10]\n"
+ "st1w { z2.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z7.s }, p1, [x9]\n"
+ "st1w { z3.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"bgt 4b\n"
"5:" // Left padding: End
@@ -264,331 +269,331 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"beq 7f\n"
"6:" // Unpadded: 4 priming loads
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z0.s }, p1/Z, [x17]\n"
- ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "ld1w { z2.s }, p1/Z, [x17]\n"
"addvl x20, SP, #12\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z28.s }, p1/Z, [x21]\n"
+ "ld1w { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab97 // bfcvt z23.h, p2/M, z28.s\n"
- "ld1w { z20.s }, p1/Z, [x21]\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa97 // bfcvtnt z23.h, p2/M, z20.s\n"
- "ld1w { z20.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa84d // bfcvt z13.h, p2/M, z2.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa98 // bfcvt z24.h, p2/M, z20.s\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
- "ld1w { z30.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa8ae // bfcvt z14.h, p2/M, z5.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aabd9 // bfcvt z25.h, p2/M, z30.s\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
+ "ld1w { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa6f // bfcvt z15.h, p2/M, z19.s\n"
+ ".inst 0x648aaa4d // bfcvtnt z13.h, p2/M, z18.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa93a // bfcvtnt z26.h, p2/M, z9.s\n"
- ".inst 0xc13b12f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z11.h\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
- ".inst 0x658aa93b // bfcvt z27.h, p2/M, z9.s\n"
- "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ ".inst 0x648aaaaf // bfcvtnt z15.h, p2/M, z21.s\n"
+ ".inst 0x658aaa91 // bfcvt z17.h, p2/M, z20.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "ld1h { z12.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x648aa830 // bfcvtnt z16.h, p2/M, z1.s\n"
+ ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
+ ".inst 0x648aa871 // bfcvtnt z17.h, p2/M, z3.s\n"
+ ".inst 0xc13411b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z4.h\n"
+ ".inst 0xc13511d0 // bfdot za.s[x8, 0], { z14.h-z17.h }, z5.h\n"
+ ".inst 0xc13c11f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z12.h\n"
"7:" // Unpadded: 3 priming loads
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x17]\n"
- ".inst 0x658aab7d // bfcvt z29.h, p2/M, z27.s\n"
+ "ld1w { z17.s }, p1/Z, [x17]\n"
"addvl x20, SP, #9\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa93e // bfcvt z30.h, p2/M, z9.s\n"
- "ld1w { z20.s }, p1/Z, [x21]\n"
+ "ld1w { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
- "ld1w { z25.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa25 // bfcvt z5.h, p2/M, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa966 // bfcvt z6.h, p2/M, z11.s\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
+ "ld1w { z10.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
- "ld1w { z23.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa47 // bfcvt z7.h, p2/M, z18.s\n"
+ ".inst 0x648aa885 // bfcvtnt z5.h, p2/M, z4.s\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaae1 // bfcvt z1.h, p2/M, z23.s\n"
- ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc13413b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z4.h\n"
- "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
+ ".inst 0x648aa9a6 // bfcvtnt z6.h, p2/M, z13.s\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa921 // bfcvtnt z1.h, p2/M, z9.s\n"
- ".inst 0xc13513d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z5.h\n"
- "ld1w { z29.s }, p1/Z, [x21]\n"
- ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
- "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ ".inst 0x648aa987 // bfcvtnt z7.h, p2/M, z12.s\n"
+ ".inst 0x658aa809 // bfcvt z9.h, p2/M, z0.s\n"
+ "ld1w { z0.s }, p1/Z, [x21]\n"
+ "ld1h { z2.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ ".inst 0x658aa80a // bfcvt z10.h, p2/M, z0.s\n"
+ ".inst 0x648aa889 // bfcvtnt z9.h, p2/M, z4.s\n"
+ ".inst 0xc13310b0 // bfdot za.s[x8, 0], { z5.h-z8.h }, z3.h\n"
+ ".inst 0xc13b10d0 // bfdot za.s[x8, 0], { z6.h-z9.h }, z11.h\n"
+ ".inst 0xc13210f0 // bfdot za.s[x8, 0], { z7.h-z10.h }, z2.h\n"
"8:" // Unpadded: 2 priming loads
"add x22, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x17]\n"
- ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "ld1w { z11.s }, p1/Z, [x17]\n"
"addvl x21, SP, #6\n"
- "ld1w { z21.s }, p1/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaaba // bfcvtnt z26.h, p2/M, z21.s\n"
"addvl x20, SP, #12\n"
- "ld1w { z25.s }, p1/Z, [x22]\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab3b // bfcvt z27.h, p2/M, z25.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z4.s }, p1/Z, [x22]\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa89b // bfcvtnt z27.h, p2/M, z4.s\n"
- "ld1w { z10.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "ld1w { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa95c // bfcvt z28.h, p2/M, z10.s\n"
- "ld1w { z4.s }, p1/Z, [x22]\n"
+ "ld1w { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa89c // bfcvtnt z28.h, p2/M, z4.s\n"
- "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaa6c // bfcvt z12.h, p2/M, z19.s\n"
+ "ld1w { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa8bd // bfcvt z29.h, p2/M, z5.s\n"
- "ld1w { z5.s }, p1/Z, [x22]\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa8bd // bfcvtnt z29.h, p2/M, z5.s\n"
- "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaa4d // bfcvt z13.h, p2/M, z18.s\n"
+ ".inst 0x648aaaeb // bfcvtnt z11.h, p2/M, z23.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa8be // bfcvt z30.h, p2/M, z5.s\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc13e1350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z14.h\n"
- "ld1w { z5.s }, p1/Z, [x22]\n"
- ".inst 0x648aa8be // bfcvtnt z30.h, p2/M, z5.s\n"
+ ".inst 0x658aaa2e // bfcvt z14.h, p2/M, z17.s\n"
+ ".inst 0x648aab8c // bfcvtnt z12.h, p2/M, z28.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13f1370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z15.h\n"
- ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1381351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z8.h\n"
- "ld1w { z23.s }, p1/Z, [x22]\n"
- ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
- ".inst 0xc1391371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ ".inst 0x648aa8cd // bfcvtnt z13.h, p2/M, z6.s\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x648aabae // bfcvtnt z14.h, p2/M, z29.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ ".inst 0x648aaa8f // bfcvtnt z15.h, p2/M, z20.s\n"
+ ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc13a1190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1301171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc13311b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z3.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ ".inst 0xc1311191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc13011b1 // bfdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x17]\n"
- ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ "ld1w { z16.s }, p1/Z, [x17]\n"
"addvl x21, SP, #3\n"
- "ld1w { z24.s }, p1/Z, [x22]\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
"addvl x20, SP, #9\n"
- "ld1w { z31.s }, p1/Z, [x22]\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aabf8 // bfcvt z24.h, p2/M, z31.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z6.s }, p1/Z, [x22]\n"
+ "ld1w { z0.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa8d8 // bfcvtnt z24.h, p2/M, z6.s\n"
+ ".inst 0x658aaa11 // bfcvt z17.h, p2/M, z16.s\n"
"ld1w { z28.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab99 // bfcvt z25.h, p2/M, z28.s\n"
- "ld1w { z26.s }, p1/Z, [x22]\n"
+ "ld1w { z22.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
- "ld1w { z28.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaaf2 // bfcvt z18.h, p2/M, z23.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab9a // bfcvt z26.h, p2/M, z28.s\n"
- "ld1w { z4.s }, p1/Z, [x22]\n"
+ "ld1w { z8.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
- "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0x658aab93 // bfcvt z19.h, p2/M, z28.s\n"
+ ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
- ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
- "ld1w { z20.s }, p1/Z, [x22]\n"
- ".inst 0x648aaa9b // bfcvtnt z27.h, p2/M, z20.s\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x648aa812 // bfcvtnt z18.h, p2/M, z0.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc1381310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z8.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc13212f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
- "ld1w { z11.s }, p1/Z, [x22]\n"
- ".inst 0x658aa97c // bfcvt z28.h, p2/M, z11.s\n"
- ".inst 0xc1331311 // bfdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
- "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1301331 // bfdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ ".inst 0x648aaad3 // bfcvtnt z19.h, p2/M, z22.s\n"
+ ".inst 0x658aaab5 // bfcvt z21.h, p2/M, z21.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x648aa914 // bfcvtnt z20.h, p2/M, z8.s\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0x648aa9b5 // bfcvtnt z21.h, p2/M, z13.s\n"
+ ".inst 0xc1321230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z2.h\n"
+ ".inst 0xc13a1250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z10.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1331231 // bfdot za.s[x8, 1], { z17.h-z20.h }, z3.h\n"
+ ".inst 0xc1301270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z0.h\n"
+ "ld1h { z7.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13b1251 // bfdot za.s[x8, 1], { z18.h-z21.h }, z11.h\n"
+ ".inst 0xc1371271 // bfdot za.s[x8, 1], { z19.h-z22.h }, z7.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z9.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x17]\n"
- ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "ld1w { z10.s }, p1/Z, [x17]\n"
"sub x7, x7, #0x2\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
"sub x16, x16, #0x1\n"
- ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab56 // bfcvt z22.h, p2/M, z26.s\n"
"lsr x20, x7, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa941 // bfcvt z1.h, p2/M, z10.s\n"
"cmp x20, x16\n"
- ".inst 0x648aab56 // bfcvtnt z22.h, p2/M, z26.s\n"
- "ld1w { z8.s }, p1/Z, [x21]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa917 // bfcvt z23.h, p2/M, z8.s\n"
"csel x26, x20, x16, LT\n"
- "ld1w { z2.s }, p1/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa857 // bfcvtnt z23.h, p2/M, z2.s\n"
+ ".inst 0x658aaaa2 // bfcvt z2.h, p2/M, z21.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
"ld1w { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa8d8 // bfcvt z24.h, p2/M, z6.s\n"
"and x7, x7, #0x1\n"
- "ld1w { z15.s }, p1/Z, [x21]\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
- "sub x16, x16, x26\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab83 // bfcvt z3.h, p2/M, z28.s\n"
+ ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
+ "ld1w { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
- "ld1w { z26.s }, p1/Z, [x21]\n"
+ "sub x16, x16, x26\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
- "ld1w { z27.s }, p1/Z, [x21]\n"
- ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ ".inst 0x658aa8c4 // bfcvt z4.h, p2/M, z6.s\n"
+ ".inst 0x648aaa62 // bfcvtnt z2.h, p2/M, z19.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa8a5 // bfcvt z5.h, p2/M, z5.s\n"
+ ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
+ ".inst 0x648aaae4 // bfcvtnt z4.h, p2/M, z23.s\n"
+ ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
+ ".inst 0x648aa985 // bfcvtnt z5.h, p2/M, z12.s\n"
"cbz x26, 19f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
"addvl x25, SP, #6\n"
"addvl x24, SP, #12\n"
- "ld1w { z14.s }, p1/Z, [x17]\n"
- ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
- ".inst 0xa1402b20 // ld1h { z0.h, z8.h }, pn10.b/Z, [x25]\n"
+ "ld1w { z10.s }, p1/Z, [x17]\n"
"add x23, x17, %x[ld_in_row], LSL #2\n"
"addvl x22, SP, #3\n"
- ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1w { z27.s }, p1/Z, [x23]\n"
+ "ld1w { z13.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13812d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z8.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "ld1w { z11.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
"addvl x21, SP, #9\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ ".inst 0xa1402b27 // ld1h { z7.h, z15.h }, pn10.b/Z, [x25]\n"
"add x20, x17, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
- "ld1w { z2.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9d5 // bfcvt z21.h, p2/M, z14.s\n"
- ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
- "ld1h { z11.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
"subs x26, x26, #0x1\n"
- "ld1w { z14.s }, p1/Z, [x23]\n"
+ "ld1w { z22.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13812d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z8.h\n"
- ".inst 0x658aa856 // bfcvt z22.h, p2/M, z2.s\n"
- "ld1w { z7.s }, p1/Z, [x23]\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13b12f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z11.h\n"
- ".inst 0x648aa9d6 // bfcvtnt z22.h, p2/M, z14.s\n"
- "ld1w { z31.s }, p1/Z, [x23]\n"
+ ".inst 0xc1371031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z7.h\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
- ".inst 0xc1acc9a8 // fclamp { z8.s-z11.s }, z13.s, z12.s\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
- ".inst 0x658aa8f7 // bfcvt z23.h, p2/M, z7.s\n"
- "add x8, x8, #0x1\n"
- "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0xc1391070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z8.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
- ".inst 0x648aabf7 // bfcvtnt z23.h, p2/M, z31.s\n"
- "ld1w { z2.s }, p1/Z, [x23]\n"
+ "ld1w { z20.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa858 // bfcvtnt z24.h, p2/M, z2.s\n"
- "st1w { z8.s }, p1, [x15]\n"
- "ld1w { z0.s }, p1/Z, [x23]\n"
+ ".inst 0xc13f1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z15.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "ld1w { z29.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa819 // bfcvt z25.h, p2/M, z0.s\n"
- "add x15, x15, x13, LSL #2\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc13212b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
- "st1w { z9.s }, p1, [x14]\n"
- "add x14, x14, x11, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x23]\n"
- ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "ld1w { z9.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc13112b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z1.h\n"
- "st1w { z10.s }, p1, [x10]\n"
- "add x10, x10, x28, LSL #2\n"
- "ld1w { z26.s }, p1/Z, [x23]\n"
- ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
- ".inst 0xc13912d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z9.h\n"
- "ld1w { z31.s }, p1/Z, [x17]\n"
- ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
- "st1w { z11.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- "ld1w { z30.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
- ".inst 0x648aabd5 // bfcvtnt z21.h, p2/M, z30.s\n"
- "ld1w { z0.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ ".inst 0xc13e1032 // bfdot za.s[x8, 2], { z1.h-z4.h }, z14.h\n"
+ ".inst 0x658aa941 // bfcvt z1.h, p2/M, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x23]\n"
+ ".inst 0xc13c1071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x17]\n"
+ ".inst 0xc1bfcbd0 // fclamp { z16.s-z19.s }, z30.s, z31.s\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z1.s }, p1/Z, [x20]\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1052 // bfdot za.s[x8, 2], { z2.h-z5.h }, z15.h\n"
+ ".inst 0x658aa962 // bfcvt z2.h, p2/M, z11.s\n"
+ ".inst 0x648aa9a1 // bfcvtnt z1.h, p2/M, z13.s\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa836 // bfcvtnt z22.h, p2/M, z1.s\n"
+ "st1w { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
"ld1w { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc13212f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- "ld1w { z28.s }, p1/Z, [x20]\n"
+ "st1w { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ ".inst 0xc13c1072 // bfdot za.s[x8, 2], { z3.h-z6.h }, z12.h\n"
+ ".inst 0x658aaaa3 // bfcvt z3.h, p2/M, z21.s\n"
+ ".inst 0x658aa904 // bfcvt z4.h, p2/M, z8.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aaac2 // bfcvtnt z2.h, p2/M, z22.s\n"
+ ".inst 0x658aaba5 // bfcvt z5.h, p2/M, z29.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaae6 // bfcvt z6.h, p2/M, z23.s\n"
+ "ld1h { z8.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "st1w { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z14.s }, p1/Z, [x20]\n"
+ "st1w { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0x648aa803 // bfcvtnt z3.h, p2/M, z0.s\n"
+ ".inst 0x648aaa84 // bfcvtnt z4.h, p2/M, z20.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa925 // bfcvtnt z5.h, p2/M, z9.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc13412f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z4.h\n"
- ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
- "ld1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa9d8 // bfcvt z24.h, p2/M, z14.s\n"
- ".inst 0x658aabb9 // bfcvt z25.h, p2/M, z29.s\n"
- "ld1w { z5.s }, p1/Z, [x20]\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab97 // bfcvtnt z23.h, p2/M, z28.s\n"
- ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
- "ld1w { z11.s }, p1/Z, [x20]\n"
- ".inst 0x648aa8b9 // bfcvtnt z25.h, p2/M, z5.s\n"
- ".inst 0x658aa97a // bfcvt z26.h, p2/M, z11.s\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xc13e1030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z14.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc13f1050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z15.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13e1031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z14.h\n"
+ ".inst 0x658aab81 // bfcvt z1.h, p2/M, z28.s\n"
+ ".inst 0xc1381070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z8.h\n"
+ "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13f1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z15.h\n"
+ ".inst 0x658aa9a2 // bfcvt z2.h, p2/M, z13.s\n"
+ ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa941 // bfcvtnt z1.h, p2/M, z10.s\n"
+ ".inst 0xc13e1071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z14.h\n"
+ ".inst 0x658aa983 // bfcvt z3.h, p2/M, z12.s\n"
+ ".inst 0x658aaaa4 // bfcvt z4.h, p2/M, z21.s\n"
+ ".inst 0x658aaa85 // bfcvt z5.h, p2/M, z20.s\n"
+ "ld1h { z9.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aa962 // bfcvtnt z2.h, p2/M, z11.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
+ ".inst 0x648aa8e3 // bfcvtnt z3.h, p2/M, z7.s\n"
+ ".inst 0x648aaa04 // bfcvtnt z4.h, p2/M, z16.s\n"
+ ".inst 0x648aaa45 // bfcvtnt z5.h, p2/M, z18.s\n"
"bgt 11b\n"
"b 19f\n"
"12:" // Padded
@@ -602,620 +607,620 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z1.s }, p0/Z, [x17]\n"
- ".inst 0x658aa837 // bfcvt z23.h, p2/M, z1.s\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
- ".inst 0x648aabb7 // bfcvtnt z23.h, p2/M, z29.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z30.s }, p0/Z, [x21]\n"
- ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z15.s }, p0/Z, [x21]\n"
- ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
- "mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa99 // bfcvtnt z25.h, p2/M, z20.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z10.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa95a // bfcvt z26.h, p2/M, z10.s\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z8.s }, p0/Z, [x21]\n"
- ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z28.s }, p0/Z, [x21]\n"
- ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
- "addvl x20, SP, #12\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc13112f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z28.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab9b // bfcvtnt z27.h, p2/M, z28.s\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
- ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
- ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc1301330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ ".inst 0xc13a1190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc13011b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"14:" // Padded: 3 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z21.s }, p0/Z, [x17]\n"
- ".inst 0x658aaab4 // bfcvt z20.h, p2/M, z21.s\n"
"add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
- ".inst 0x648aab74 // bfcvtnt z20.h, p2/M, z27.s\n"
- "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
- ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
- ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
- "mov x12, #0x4\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z29.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aabb6 // bfcvt z22.h, p2/M, z29.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z8.s }, p0/Z, [x21]\n"
- ".inst 0x648aa917 // bfcvtnt z23.h, p2/M, z8.s\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z28.s }, p0/Z, [x21]\n"
- ".inst 0x658aab98 // bfcvt z24.h, p2/M, z28.s\n"
- "addvl x20, SP, #9\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1311290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z1.h\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z0.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa818 // bfcvtnt z24.h, p2/M, z0.s\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z1.s }, p0/Z, [x21]\n"
- ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
- ".inst 0xc13912b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z9.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13012d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ ".inst 0xc13a1190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xc13011b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"15:" // Padded: 2 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z6.s }, p0/Z, [x17]\n"
- ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #12\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z29.s }, p0/Z, [x22]\n"
- ".inst 0x648aabba // bfcvtnt z26.h, p2/M, z29.s\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z28.s }, p0/Z, [x22]\n"
- ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z14.s }, p0/Z, [x22]\n"
- ".inst 0x648aa9db // bfcvtnt z27.h, p2/M, z14.s\n"
- "mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z1.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa83c // bfcvtnt z28.h, p2/M, z1.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z3.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa87d // bfcvt z29.h, p2/M, z3.s\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z0.s }, p0/Z, [x22]\n"
- ".inst 0x648aa81d // bfcvtnt z29.h, p2/M, z0.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x22]\n"
- ".inst 0x658aab1e // bfcvt z30.h, p2/M, z24.s\n"
- "addvl x21, SP, #6\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1311350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z1.h\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z23.s }, p0/Z, [x22]\n"
- ".inst 0x648aaafe // bfcvtnt z30.h, p2/M, z23.s\n"
- "addvl x20, SP, #12\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1391370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ ".inst 0xc1391190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "ld1w { z31.s }, p0/Z, [x22]\n"
- ".inst 0xc1301351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z0.h\n"
- ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1311371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z1.h\n"
- ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ ".inst 0xc1301171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc13211b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ ".inst 0xc1311191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc13011b1 // bfdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 1 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z22.s }, p0/Z, [x17]\n"
- ".inst 0x658aaad5 // bfcvt z21.h, p2/M, z22.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #9\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z3.s }, p0/Z, [x22]\n"
- ".inst 0x648aa875 // bfcvtnt z21.h, p2/M, z3.s\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
- ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z25.s }, p0/Z, [x22]\n"
- ".inst 0x648aab36 // bfcvtnt z22.h, p2/M, z25.s\n"
- "mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z24.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab17 // bfcvt z23.h, p2/M, z24.s\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z0.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z7.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa8f8 // bfcvt z24.h, p2/M, z7.s\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z28.s }, p0/Z, [x22]\n"
- ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa2d // bfcvtnt z13.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z6.s }, p0/Z, [x22]\n"
- ".inst 0x658aa8d9 // bfcvt z25.h, p2/M, z6.s\n"
- "addvl x21, SP, #3\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc13112b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z1.h\n"
+ ".inst 0x658aaa0e // bfcvt z14.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z6.s }, p0/Z, [x22]\n"
- ".inst 0x648aa8d9 // bfcvtnt z25.h, p2/M, z6.s\n"
- "addvl x20, SP, #9\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc13912d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z9.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381170 // bfdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ ".inst 0xc1391190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "ld1w { z3.s }, p0/Z, [x22]\n"
- ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- ".inst 0x658aa87a // bfcvt z26.h, p2/M, z3.s\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ ".inst 0xc1301171 // bfdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc13211b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ ".inst 0xc1311191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc13011b1 // bfdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 0 priming loads
"cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z9.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 20f\n"
"mov x12, #0x0\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z25.s }, p0/Z, [x17]\n"
- ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
- "add x20, x17, %x[ld_in_row], LSL #2\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "and x7, x7, #0x1\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x26, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "sub x16, x16, x26\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0x658aab76 // bfcvt z22.h, p2/M, z27.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa01 // bfcvt z1.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa02 // bfcvt z2.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z25.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab37 // bfcvtnt z23.h, p2/M, z25.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa22 // bfcvtnt z2.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
"mov x12, #0x8\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa23 // bfcvtnt z3.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa04 // bfcvt z4.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z26.s }, p0/Z, [x20]\n"
- ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa24 // bfcvtnt z4.h, p2/M, z17.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
- "sub x7, x7, #0x2\n"
- "sub x16, x16, #0x1\n"
- "lsr x20, x7, #0x1\n"
- "cmp x20, x16\n"
- "csel x24, x20, x16, LT\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- "and x7, x7, #0x1\n"
- "sub x16, x16, x24\n"
- "cbz x24, 19f\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa05 // bfcvt z5.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa45 // bfcvtnt z5.h, p2/M, z18.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
+ "cbz x26, 19f\n"
"18:" // Padded: Main loop
- ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
- "addvl x23, SP, #6\n"
- "addvl x21, SP, #12\n"
- ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
"mov x12, #0x0\n"
+ "add x25, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1w { z9.s }, p0/Z, [x17]\n"
- "add x20, x17, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
"addvl x22, SP, #3\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ "ld1w { z18.s }, p0/Z, [x17]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
"add x17, x17, %x[ld_in_col], LSL #2\n"
- ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa0402b0a // ld1h { z10.h-z11.h }, pn10.b/Z, [x24]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p0/Z, [x25]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z10.h\n"
+ ".inst 0xc1391070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x25]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
"mov x12, #0x4\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
- ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
- ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z11.h\n"
+ ".inst 0xa0402aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23]\n"
+ "ld1w { z17.s }, p0/Z, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc13a1032 // bfdot za.s[x8, 2], { z1.h-z4.h }, z10.h\n"
+ ".inst 0x658aaa52 // bfcvt z18.h, p2/M, z18.s\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aab62 // bfcvt z2.h, p2/M, z27.s\n"
- "ld1w { z27.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1391071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc1bfcbcc // fclamp { z12.s-z15.s }, z30.s, z31.s\n"
+ "ld1w { z22.s }, p0/Z, [x25]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
- ".inst 0x658aa923 // bfcvt z3.h, p2/M, z9.s\n"
- "addvl x21, SP, #9\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1052 // bfdot za.s[x8, 2], { z2.h-z5.h }, z11.h\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0x648aa812 // bfcvtnt z18.h, p2/M, z0.s\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "st1w { z12.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "ld1w { z11.s }, p0/Z, [x25]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aa924 // bfcvt z4.h, p2/M, z9.s\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ "st1w { z13.s }, p1, [x14]\n"
"mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa9e2 // bfcvtnt z2.h, p2/M, z15.s\n"
+ ".inst 0xc1391072 // bfdot za.s[x8, 2], { z3.h-z6.h }, z9.h\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aaa33 // bfcvtnt z19.h, p2/M, z17.s\n"
+ "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z14.s }, p1, [x10]\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aab63 // bfcvtnt z3.h, p2/M, z27.s\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa975 // bfcvt z21.h, p2/M, z11.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z15.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ ".inst 0x648aaad4 // bfcvtnt z20.h, p2/M, z22.s\n"
+ "ld1w { z17.s }, p0/Z, [x25]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x648aab04 // bfcvtnt z4.h, p2/M, z24.s\n"
- ".inst 0x658aa925 // bfcvt z5.h, p2/M, z9.s\n"
- ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- ".inst 0x648aabc5 // bfcvtnt z5.h, p2/M, z30.s\n"
- ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x25]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x25, x25, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa36 // bfcvt z22.h, p2/M, z17.s\n"
"mov x12, #0x0\n"
+ ".inst 0xc1301250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z0.h\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- "ld1w { z0.s }, p0/Z, [x17]\n"
- "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
+ "ld1w { z9.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc1361031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z6.h\n"
- "ld1w { z10.s }, p0/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z8.h\n"
+ ".inst 0xa0402aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
- "ld1w { z9.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1251 // bfdot za.s[x8, 1], { z18.h-z21.h }, z12.h\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13e1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z14.h\n"
- "mov x12, #0x4\n"
- "ld1w { z29.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa815 // bfcvt z21.h, p2/M, z0.s\n"
+ ".inst 0xc1321290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z2.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z12.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z31.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa936 // bfcvt z22.h, p2/M, z9.s\n"
+ ".inst 0xc13d1271 // bfdot za.s[x8, 1], { z19.h-z22.h }, z13.h\n"
+ ".inst 0x658aaa02 // bfcvt z2.h, p2/M, z16.s\n"
+ ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa881 // bfcvtnt z1.h, p2/M, z4.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1301070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z0.h\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1291 // bfdot za.s[x8, 1], { z20.h-z23.h }, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aaa22 // bfcvtnt z2.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1acc9b8 // fclamp { z24.s-z27.s }, z13.s, z12.s\n"
- "ld1w { z30.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "st1w { z24.s }, p1, [x15]\n"
- "mov x12, #0x8\n"
- "ld1w { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
- "st1w { z25.s }, p1, [x14]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa23 // bfcvtnt z3.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1301071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z0.h\n"
- ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
- "ld1w { z8.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa04 // bfcvt z4.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
- "ld1w { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa24 // bfcvtnt z4.h, p2/M, z17.s\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x658aa919 // bfcvt z25.h, p2/M, z8.s\n"
- "ld1w { z5.s }, p0/Z, [x20]\n"
- "add x15, x15, x13, LSL #2\n"
- "add x14, x14, x11, LSL #2\n"
- "st1w { z26.s }, p1, [x10]\n"
- "add x10, x10, x28, LSL #2\n"
- "st1w { z27.s }, p1, [x9]\n"
- "add x9, x9, x27, LSL #2\n"
- ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0x648aa955 // bfcvtnt z21.h, p2/M, z10.s\n"
- ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
- ".inst 0x648aa9f7 // bfcvtnt z23.h, p2/M, z15.s\n"
- ".inst 0x648aa9d8 // bfcvtnt z24.h, p2/M, z14.s\n"
- ".inst 0x648aa899 // bfcvtnt z25.h, p2/M, z4.s\n"
- ".inst 0x658aa8ba // bfcvt z26.h, p2/M, z5.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa05 // bfcvt z5.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa45 // bfcvtnt z5.h, p2/M, z18.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
"bgt 18b\n"
"19:" // Main loop tail
- ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
- "addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
- ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
- ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ "add x24, x17, %x[ld_in_row], LSL #2\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1w { z5.s }, p0/Z, [x17]\n"
- "add x22, x17, %x[ld_in_row], LSL #2\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x23, SP, #6\n"
+ "addvl x22, SP, #12\n"
"addvl x21, SP, #3\n"
"addvl x20, SP, #9\n"
- ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
- "ld1w { z29.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p0/Z, [x17]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0402aec // ld1h { z12.h-z13.h }, pn10.b/Z, [x23]\n"
+ "ld1w { z14.s }, p0/Z, [x24]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z2.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z12.h\n"
+ ".inst 0xc1391070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z9.h\n"
+ "ld1h { z7.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
"mov x12, #0x4\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
- ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
- ".inst 0x658aa8bb // bfcvt z27.h, p2/M, z5.s\n"
- "ld1w { z20.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z13.h\n"
+ ".inst 0xa0402acc // ld1h { z12.h-z13.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z15.s }, p0/Z, [x24]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "ld1w { z1.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc13c1032 // bfdot za.s[x8, 2], { z1.h-z4.h }, z12.h\n"
+ ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
+ "ld1w { z19.s }, p0/Z, [x24]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x658aa85c // bfcvt z28.h, p2/M, z2.s\n"
- "ld1w { z14.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z7.h\n"
+ "ld1h { z7.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1bfcbc8 // fclamp { z8.s-z11.s }, z30.s, z31.s\n"
+ "ld1w { z18.s }, p0/Z, [x24]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0x648aabbb // bfcvtnt z27.h, p2/M, z29.s\n"
- "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
- ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
- ".inst 0x658aa83d // bfcvt z29.h, p2/M, z1.s\n"
- "add x17, x17, %x[ld_in_col], LSL #2\n"
- "ld1w { z1.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1052 // bfdot za.s[x8, 2], { z2.h-z5.h }, z13.h\n"
+ ".inst 0x658aaa2d // bfcvt z13.h, p2/M, z17.s\n"
+ ".inst 0x648aa9cc // bfcvtnt z12.h, p2/M, z14.s\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "st1w { z8.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "ld1w { z8.s }, p0/Z, [x24]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ "st1w { z9.s }, p1, [x14]\n"
"mov x12, #0x8\n"
- "ld1w { z31.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aaa9c // bfcvtnt z28.h, p2/M, z20.s\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z26.s }, p0/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa9dd // bfcvtnt z29.h, p2/M, z14.s\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0x648aabfe // bfcvtnt z30.h, p2/M, z31.s\n"
- ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
- ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
- "ld1w { z9.s }, p0/Z, [x22]\n"
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1371072 // bfdot za.s[x8, 2], { z3.h-z6.h }, z7.h\n"
+ ".inst 0x658aaa6e // bfcvt z14.h, p2/M, z19.s\n"
"add x8, x8, #0x1\n"
- ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
- ".inst 0xc1321370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z2.h\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z26.s }, p0/Z, [x22]\n"
- ".inst 0xc13a1390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
- ".inst 0xc1321371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z2.h\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
- ".inst 0xc13a1391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z10.h\n"
- "st1w { z4.s }, p1, [x15]\n"
- "add x15, x15, x13, LSL #2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc13913b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1w { z5.s }, p1, [x14]\n"
+ ".inst 0x648aa9ed // bfcvtnt z13.h, p2/M, z15.s\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z6.s }, p1, [x10]\n"
+ "st1w { z10.s }, p1, [x10]\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa90f // bfcvt z15.h, p2/M, z8.s\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
- "st1w { z7.s }, p1, [x9]\n"
+ "st1w { z11.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
- ".inst 0xc13913b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
- "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ ".inst 0x648aaa4e // bfcvtnt z14.h, p2/M, z18.s\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2f // bfcvtnt z15.h, p2/M, z17.s\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301190 // bfdot za.s[x8, 0], { z12.h-z15.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ ".inst 0x658aa931 // bfcvt z17.h, p2/M, z9.s\n"
+ ".inst 0xc13111b0 // bfdot za.s[x8, 0], { z13.h-z16.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1301191 // bfdot za.s[x8, 1], { z12.h-z15.h }, z0.h\n"
+ ".inst 0xc13211d0 // bfdot za.s[x8, 0], { z14.h-z17.h }, z2.h\n"
+ "ld1h { z5.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13111b1 // bfdot za.s[x8, 1], { z13.h-z16.h }, z1.h\n"
+ ".inst 0xa1402be0 // ld1h { z0.h, z8.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc13511d1 // bfdot za.s[x8, 1], { z14.h-z17.h }, z5.h\n"
+ "ld1h { z9.h }, p2/Z, [SP, #2, MUL VL]\n"
"20:" // Main loop skip tail
"cbz x7, 21f\n" // Skip remainder inputs
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z25.s }, p0/Z, [x17]\n"
- ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
"add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x21, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ "ld1w { z16.s }, p0/Z, [x17]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z26.s }, p0/Z, [x22]\n"
- ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
- "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z25.s }, p0/Z, [x22]\n"
- ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1w { z24.s }, p0/Z, [x22]\n"
- ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
- "mov x12, #0x4\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x4\n"
+ ".inst 0x648aaa32 // bfcvtnt z18.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z26.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z9.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0x648aaa33 // bfcvtnt z19.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1w { z9.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ ".inst 0x658aabb4 // bfcvt z20.h, p2/M, z29.s\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "mov x12, #0x8\n"
- "ld1w { z24.s }, p0/Z, [x22]\n"
- ".inst 0x648aab00 // bfcvtnt z0.h, p2/M, z24.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa34 // bfcvtnt z20.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1w { z9.s }, p0/Z, [x22]\n"
- ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1w { z25.s }, p0/Z, [x22]\n"
- ".inst 0x648aab21 // bfcvtnt z1.h, p2/M, z25.s\n"
- ".inst 0xc13313b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
- "addvl x21, SP, #6\n"
"add x22, x22, %x[ld_in_row], LSL #2\n"
- ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ ".inst 0x658aa9b6 // bfcvt z22.h, p2/M, z13.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "addvl x20, SP, #12\n"
- ".inst 0xc13e13b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z14.h\n"
- "ld1w { z25.s }, p0/Z, [x22]\n"
- ".inst 0x658aab22 // bfcvt z2.h, p2/M, z25.s\n"
- "sub x16, x16, #0x1\n"
- ".inst 0xc13f13d1 // bfdot za.s[x8, 1], { z30.h-z1.h }, z15.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc13e13b2 // bfdot za.s[x8, 2], { z29.h-z0.h }, z14.h\n"
- ".inst 0xc13713f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z7.h\n"
- "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc13f13d2 // bfdot za.s[x8, 2], { z30.h-z1.h }, z15.h\n"
- ".inst 0xc13413f1 // bfdot za.s[x8, 1], { z31.h-z2.h }, z4.h\n"
- "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z0.h\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa36 // bfcvtnt z22.h, p2/M, z17.s\n"
+ ".inst 0x658aa9d7 // bfcvt z23.h, p2/M, z14.s\n"
+ ".inst 0xc1381270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z8.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1301251 // bfdot za.s[x8, 1], { z18.h-z21.h }, z0.h\n"
+ ".inst 0xc1391290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z9.h\n"
+ "ld1h { z14.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1311271 // bfdot za.s[x8, 1], { z19.h-z22.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
- ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ ".inst 0xc1301252 // bfdot za.s[x8, 2], { z18.h-z21.h }, z0.h\n"
+ ".inst 0xc13e1291 // bfdot za.s[x8, 1], { z20.h-z23.h }, z14.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1bfcbc4 // fclamp { z4.s-z7.s }, z30.s, z31.s\n"
+ ".inst 0xc1311272 // bfdot za.s[x8, 2], { z19.h-z22.h }, z1.h\n"
"st1w { z4.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xc13913f2 // bfdot za.s[x8, 2], { z31.h-z2.h }, z9.h\n"
- "add x8, x8, #0x1\n"
"st1w { z5.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
+ ".inst 0xc1301292 // bfdot za.s[x8, 2], { z20.h-z23.h }, z0.h\n"
+ "add x8, x8, #0x1\n"
"st1w { z6.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
"st1w { z7.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
"21:" // Tail input: End
"cbz x16, 23f\n"
"22:" // Right padding loop
- ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
"subs x16, x16, #0x1\n"
- ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
- "st1w { z4.s }, p1, [x15]\n"
+ ".inst 0xc0040f04 // mova za.d[x8, #4], { z24.d-z27.d }\n"
+ ".inst 0xc1bfcbd0 // fclamp { z16.s-z19.s }, z30.s, z31.s\n"
+ "st1w { z16.s }, p1, [x15]\n"
"add x15, x15, x13, LSL #2\n"
- ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
- "st1w { z5.s }, p1, [x14]\n"
+ "st1w { z17.s }, p1, [x14]\n"
"add x14, x14, x11, LSL #2\n"
- "st1w { z6.s }, p1, [x10]\n"
+ "st1w { z18.s }, p1, [x10]\n"
"add x10, x10, x28, LSL #2\n"
- "st1w { z7.s }, p1, [x9]\n"
+ "st1w { z19.s }, p1, [x9]\n"
"add x9, x9, x27, LSL #2\n"
"bgt 22b\n"
"23:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
"incb x20, ALL, MUL #16\n"
"incb x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x6\n"
- "whilelt p1.s, x6, x5\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21, LSL #2\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1234,6 +1239,8 @@ void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 845f376926..f5a4583d74 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,133 +70,138 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x6\n"
"ptrue p2.b\n"
- "mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x6\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x22, #0x8\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z20.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-12\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z20.h, p2/M, z20.h\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x7\n"
- "addvl SP, SP, #-12\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z21.h, p2/M, z21.h\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z30.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z23.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z25.h, #0x0\n"
+ "addvl x22, SP, #12\n"
+ "addvl x22, x22, #-4\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z29.d, z28.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z7.h, #0x0\n"
- "sub z10.h, z10.h, z31.h\n"
- "incw x22\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z16.h, z16.h, z31.h\n"
- "trn1 z20.h, z7.h, z10.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
- "sub z11.h, z11.h, z31.h\n"
- "mov x20, x22\n"
- "trn1 z19.h, z10.h, z16.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z0.h, z0.h, z23.h\n"
+ "sub z26.h, z26.h, z23.h\n"
+ "sub z15.h, z15.h, z23.h\n"
+ "trn1 z14.h, z25.h, z0.h\n"
+ "trn1 z2.h, z0.h, z26.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z26.h, z16.h, z11.h\n"
- "trn1 z13.h, z11.h, z7.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z26.h, z15.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z31.h\n"
- "sub z11.h, z11.h, z31.h\n"
- "ld1sb { z2.s }, p2/Z, [x20]\n"
- "sub z2.h, z2.h, z31.h\n"
- "addvl x21, SP, #12\n"
- "incw x22\n"
- "addvl x21, x21, #-4\n"
- "mov x20, x22\n"
- "st1h { z20.h }, p2, [x21]\n"
- "trn1 z22.h, z7.h, z24.h\n"
- "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z24.h, z11.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "trn1 z15.h, z15.h, z25.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "sub z21.h, z21.h, z23.h\n"
+ "st1h { z14.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z23.h\n"
+ "st1h { z2.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z11.h, z11.h, z23.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #-4\n"
+ "trn1 z3.h, z25.h, z21.h\n"
+ "trn1 z14.h, z21.h, z1.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z3.h, z11.h, z2.h\n"
- "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "trn1 z10.h, z1.h, z11.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z25.h, z2.h, z7.h\n"
- "ld1sb { z4.s }, p2/Z, [x20]\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z31.h\n"
- "sub z0.h, z0.h, z31.h\n"
- "addvl x21, x21, #-4\n"
- "st1h { z22.h }, p2, [x21]\n"
- "sub z4.h, z4.h, z31.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z30.d\n"
- "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z24.h, z7.h, z16.h\n"
- "trn1 z18.h, z16.h, z0.h\n"
- "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #-4\n"
- "trn1 z0.h, z0.h, z4.h\n"
- "trn1 z1.h, z4.h, z7.h\n"
- "st1h { z24.h }, p2, [x21]\n"
- "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "trn1 z26.h, z11.h, z25.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z15.h, z15.h, z23.h\n"
+ "st1h { z3.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z23.h\n"
+ "st1h { z14.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z16.h, z16.h, z23.h\n"
+ "st1h { z10.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #-4\n"
+ "trn1 z22.h, z25.h, z15.h\n"
+ "trn1 z6.h, z15.h, z9.h\n"
+ "trn1 z12.h, z9.h, z16.h\n"
+ "trn1 z11.h, z16.h, z25.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z6.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z12.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z11.h }, p2, [x22, #3, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z5.s }, p1/Z, [x21, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x17, x23, LSL #22\n"
"mov x22, #0x6\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x7, x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x7, x14\n"
+ "orr x20, x17, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x7, x21, x14\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040b82 // mova za.d[x8, #2], { z28.d-z29.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ ".inst 0xc0040b83 // mova za.d[x8, #3], { z28.d-z29.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
"ldp x27, x26, [x23], #0x10\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +209,22 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +236,148 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z20.s }, p1/Z, [x14]\n"
+ "ld1sb { z27.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z20.h, z16.h\n"
- "add z4.h, z4.h, z21.h\n"
- "ld1sb { z23.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z5.h, z23.h, z22.h\n"
- "add z5.h, z5.h, z21.h\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "trn1 z6.h, z17.h, z16.h\n"
- "add z6.h, z6.h, z21.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
- ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
- ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
- ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z27.h, z16.h\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "trn1 z16.h, z3.h, z1.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z15.h, z15.h, z20.h\n"
+ "trn1 z17.h, z12.h, z18.h\n"
+ "add z16.h, z16.h, z20.h\n"
+ "add z17.h, z17.h, z20.h\n"
+ ".inst 0xc16b15e8 // sdot za.s[x8, 0], { z15.h-z16.h }, z11.h\n"
+ ".inst 0xc16a15e9 // sdot za.s[x8, 1], { z15.h-z16.h }, z10.h\n"
+ ".inst 0xc1631608 // sdot za.s[x8, 0], { z16.h-z17.h }, z3.h\n"
+ ".inst 0xc1621609 // sdot za.s[x8, 1], { z16.h-z17.h }, z2.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1sb { z25.s }, p1/Z, [x14]\n"
+ "ld1sb { z22.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1sb { z6.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z3.h, z25.h, z6.h\n"
- "add z3.h, z3.h, z21.h\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1sb { z26.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z4.h, z18.h, z26.h\n"
- "add z4.h, z4.h, z21.h\n"
- "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z5.s }, p1/Z, [x22]\n"
- "trn1 z5.h, z2.h, z5.h\n"
- "add z5.h, z5.h, z21.h\n"
+ "ld1sb { z10.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z22.h, z16.h\n"
+ "ld1sb { z7.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z19.h, z10.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
- ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
- ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
- ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
- ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
- ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
- ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "add z21.h, z21.h, z20.h\n"
+ "trn1 z23.h, z11.h, z7.h\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
+ ".inst 0xc16116a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16016a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z0.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16e16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16616c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16916ca // sdot za.s[x8, 2], { z22.h-z23.h }, z9.h\n"
+ ".inst 0xc16116cb // sdot za.s[x8, 3], { z22.h-z23.h }, z1.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "ld1sb { z15.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z9.s }, p1/Z, [x20]\n"
+ "ld1sb { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z6.h, z6.h, z21.h\n"
- "ld1sb { z7.s }, p1/Z, [x20]\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "ld1sb { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z7.h, z7.h, z21.h\n"
+ "trn1 z21.h, z15.h, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z1.s }, p1/Z, [x20]\n"
- "trn1 z8.h, z17.h, z1.h\n"
- "add z8.h, z8.h, z21.h\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
"sub x13, x13, x23\n"
+ "trn1 z22.h, z24.h, z9.h\n"
+ "trn1 z23.h, z2.h, z15.h\n"
+ "add z21.h, z21.h, z20.h\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1sb { z2.s }, p1/Z, [x14]\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "ld1sb { z26.s }, p1/Z, [x14]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "ld1sb { z4.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "ld1sb { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
- "ld1sb { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
- "ld1sb { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z14.h\n"
+ "ld1sb { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
- "trn1 z6.h, z2.h, z19.h\n"
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xc16616ab // sdot za.s[x8, 3], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ "ld1sb { z11.s }, p1/Z, [x20]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc16916ac // sdot za.s[x8, 4], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16116ad // sdot za.s[x8, 5], { z21.h-z22.h }, z1.h\n"
+ "trn1 z21.h, z26.h, z4.h\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16f16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc1a5ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z5.s\n"
+ ".inst 0xc16716cb // sdot za.s[x8, 3], { z22.h-z23.h }, z7.h\n"
".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
- "st1b { z24.s }, p1, [x11]\n"
- "add x11, x11, x9\n"
- "add z6.h, z6.h, z21.h\n"
- ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
- "trn1 z7.h, z23.h, z18.h\n"
- "trn1 z8.h, z17.h, z16.h\n"
+ "add z21.h, z21.h, z20.h\n"
+ ".inst 0xc1adaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc16916cc // sdot za.s[x8, 4], { z22.h-z23.h }, z9.h\n"
+ ".inst 0xc16116cd // sdot za.s[x8, 5], { z22.h-z23.h }, z1.h\n"
+ "trn1 z22.h, z27.h, z3.h\n"
+ "trn1 z23.h, z25.h, z11.h\n"
"add x8, x8, #0x2\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z26.s }, p1, [x10]\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
+ ".inst 0xc1bfcfd0 // sclamp { z16.s-z19.s }, z30.s, z31.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z17.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- "add z7.h, z7.h, z21.h\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -382,258 +387,258 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1sb { z17.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z7.h, z19.h, z18.h\n"
- "trn1 z8.h, z17.h, z16.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z9.h, z17.h, z16.h\n"
- ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
- ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
- ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ ".inst 0xc16c16e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z12.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc16416e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z4.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ ".inst 0xc16f1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1671709 // sdot za.s[x8, 1], { z24.h-z25.h }, z7.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x22, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x21, SP, #4\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xa1412aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "ld1sb { z17.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z16.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z19.h, z18.h\n"
- "trn1 z23.h, z17.h, z16.h\n"
+ "ld1sb { z10.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z10.h, p0/M, z10.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z17.h, z18.h, z10.h\n"
+ "add z14.h, p0/M, z14.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #8\n"
- "trn1 z24.h, z17.h, z16.h\n"
- ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
- ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
- ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
- ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
- ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xc16f1608 // sdot za.s[x8, 0], { z16.h-z17.h }, z15.h\n"
+ "ld1sb { z10.s }, p0/Z, [x22]\n"
+ ".inst 0xc1671609 // sdot za.s[x8, 1], { z16.h-z17.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ "add z10.h, p0/M, z10.h, z20.h\n"
+ ".inst 0xc16f160a // sdot za.s[x8, 2], { z16.h-z17.h }, z15.h\n"
+ ".inst 0xc167160b // sdot za.s[x8, 3], { z16.h-z17.h }, z7.h\n"
+ "trn1 z18.h, z14.h, z10.h\n"
+ ".inst 0xc16c1628 // sdot za.s[x8, 0], { z17.h-z18.h }, z12.h\n"
+ ".inst 0xc1641629 // sdot za.s[x8, 1], { z17.h-z18.h }, z4.h\n"
".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
- ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xc161162a // sdot za.s[x8, 2], { z17.h-z18.h }, z1.h\n"
+ ".inst 0xc160162b // sdot za.s[x8, 3], { z17.h-z18.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "ld1sb { z17.s }, p0/Z, [x14]\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z6.h, z19.h, z18.h\n"
- "trn1 z7.h, z17.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- "sub x15, x15, #0x1\n"
- "sub x13, x13, #0x1\n"
- "cmp x15, x13\n"
- "trn1 z8.h, z17.h, z16.h\n"
- "csel x23, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col]\n"
- "sub x13, x13, x23\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
"cbz x23, 17f\n"
"16:" // Padded: Main loop
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z9.s }, p0/Z, [x14]\n"
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ "addvl x21, SP, #4\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "subs x23, x23, #0x1\n"
+ "ld1sb { z16.s }, p0/Z, [x14]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x22]\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xc16f16ac // sdot za.s[x8, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ad // sdot za.s[x8, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16e16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z14.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16616cb // sdot za.s[x8, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
"ld1sb { z18.s }, p0/Z, [x22]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16f16cc // sdot za.s[x8, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16716cd // sdot za.s[x8, 5], { z22.h-z23.h }, z7.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z21.h, z16.h, z17.h\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
- "addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #8\n"
- ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
- "subs x23, x23, #0x1\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
"ld1sb { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- "ld1sb { z2.s }, p0/Z, [x22]\n"
- ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
- "add z2.h, p0/M, z2.h, z21.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z6.h, z9.h, z19.h\n"
- ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- "trn1 z7.h, z18.h, z16.h\n"
- "trn1 z8.h, z17.h, z2.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- "st1b { z24.s }, p1, [x11]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ ".inst 0xc16f16ac // sdot za.s[x8, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ad // sdot za.s[x8, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16e16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc16616cb // sdot za.s[x8, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc16c16cc // sdot za.s[x8, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc16416cd // sdot za.s[x8, 5], { z22.h-z23.h }, z4.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
- ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
- ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a5ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1adaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc1a8ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc1bfcfd8 // sclamp { z24.s-z27.s }, z30.s, z31.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -652,6 +657,8 @@ void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #12\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 1d0efc6bc1..a3cfa94b03 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,119 +70,124 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x9\n"
"ptrue p2.b\n"
- "mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x6\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x22, #0x8\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z29.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-6\n"
+ "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z29.h, p2/M, z29.h\n"
+ "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x7\n"
- "addvl SP, SP, #-6\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z11.h, p2/M, z11.h\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z16.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z22.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z27.h, #0x0\n"
+ "addvl x22, SP, #6\n"
+ "addvl x22, x22, #-2\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z17.d, z16.d\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z26.h, z26.h, z16.h\n"
- "incw x22\n"
- "mov z24.h, #0x0\n"
- "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z3.h, z3.h, z16.h\n"
- "trn1 z31.h, z26.h, z3.h\n"
- "ld1sb { z21.s }, p2/Z, [x20]\n"
- "sub z21.h, z21.h, z16.h\n"
- "mov x20, x22\n"
- "trn1 z14.h, z21.h, z24.h\n"
- "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z25.h, z25.h, z22.h\n"
+ "sub z15.h, z15.h, z22.h\n"
+ "sub z9.h, z9.h, z22.h\n"
+ "trn1 z24.h, z25.h, z15.h\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z2.h, z2.h, z16.h\n"
- "addvl x21, SP, #6\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z16.h\n"
- "incw x22\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
- "sub z27.h, z27.h, z16.h\n"
- "addvl x21, x21, #-2\n"
- "mov x20, x22\n"
- "st1h { z31.h }, p2, [x21]\n"
- "trn1 z4.h, z2.h, z25.h\n"
- "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "trn1 z11.h, z9.h, z27.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "sub z12.h, z12.h, z22.h\n"
+ "sub z4.h, z4.h, z22.h\n"
+ "st1h { z24.h }, p2, [x22]\n"
+ "sub z15.h, z15.h, z22.h\n"
+ "st1h { z11.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #-2\n"
+ "trn1 z9.h, z12.h, z4.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z12.h, z27.h, z24.h\n"
- "ld1sb { z20.s }, p2/Z, [x20]\n"
- "sub z26.h, z26.h, z16.h\n"
- "sub z23.h, z23.h, z16.h\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z20.h, z20.h, z16.h\n"
- "addvl x21, x21, #-2\n"
- "st1h { z4.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
- "addvl x21, x21, #-2\n"
- "mov z30.d, z28.d\n"
- "mov z31.d, z28.d\n"
- "trn1 z25.h, z26.h, z23.h\n"
- "st1h { z25.h }, p2, [x21]\n"
- "trn1 z3.h, z20.h, z24.h\n"
- "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "trn1 z21.h, z15.h, z27.h\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z22.h\n"
+ "sub z10.h, z10.h, z22.h\n"
+ "st1h { z9.h }, p2, [x22]\n"
+ "sub z30.h, z30.h, z22.h\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #-2\n"
+ "trn1 z15.h, z14.h, z10.h\n"
+ "trn1 z25.h, z30.h, z27.h\n"
+ "st1h { z15.h }, p2, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z1.s }, p1/Z, [x21, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x17, x23, LSL #22\n"
"mov x22, #0x9\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x7, x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x7, x14\n"
+ "orr x20, x17, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x7, x21, x14\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
"ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
@@ -191,24 +196,24 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a8ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc1bccfec // sclamp { z12.s-z15.s }, z31.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "st1b { z12.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z25.s }, p1, [x10]\n"
+ "st1b { z13.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z26.s }, p1, [x27]\n"
+ "st1b { z14.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z15.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +225,194 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "ld1sb { z23.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z21.h\n"
- "add z1.h, z1.h, z11.h\n"
- "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z2.h, z2.h, z15.h\n"
- "add z2.h, z2.h, z11.h\n"
- "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z3.h, z3.h, z21.h\n"
- "add z3.h, z3.h, z11.h\n"
- "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ "ld1sb { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z4.h, z19.h\n"
- "add z4.h, z4.h, z11.h\n"
- "ld1sb { z8.s }, p1/Z, [x21]\n"
- "mov z5.d, z8.d\n"
- "add z5.h, z5.h, z11.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z6.h\n"
+ "ld1sb { z10.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z29.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "add z24.h, z24.h, z29.h\n"
+ "mov z27.d, z10.d\n"
+ "add z25.h, z25.h, z29.h\n"
+ "add z26.h, z26.h, z29.h\n"
+ "add z27.h, z27.h, z29.h\n"
+ ".inst 0xc17616e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z6.h\n"
+ ".inst 0xc17e1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z14.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "ld1sb { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1sb { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z21.h\n"
- "add z1.h, z1.h, z11.h\n"
"ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z2.h, z2.h, z12.h\n"
- "add z2.h, z2.h, z11.h\n"
- "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "trn1 z20.h, z20.h, z2.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z3.h, z3.h, z8.h\n"
- "add z3.h, z3.h, z11.h\n"
- "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "trn1 z21.h, z21.h, z25.h\n"
+ "ld1sb { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z4.h, z5.h\n"
- "add z4.h, z4.h, z11.h\n"
- "ld1sb { z5.s }, p1/Z, [x21]\n"
- "mov z5.d, z5.d\n"
- "add z5.h, z5.h, z11.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "trn1 z22.h, z22.h, z24.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add z20.h, z20.h, z29.h\n"
+ ".inst 0xa0402a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z9.h\n"
+ "add z21.h, z21.h, z29.h\n"
+ "mov z24.d, z3.d\n"
+ "add z22.h, z22.h, z29.h\n"
+ "add z23.h, z23.h, z29.h\n"
+ "add z24.h, z24.h, z29.h\n"
+ ".inst 0xc1761688 // sdot za.s[x8, 0], { z20.h-z23.h }, z6.h\n"
+ ".inst 0xc17716a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z7.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "ld1sb { z10.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z21.h, z21.h, z11.h\n"
- "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z25.h\n"
"csel x23, x20, x13, LT\n"
- "add z22.h, z22.h, z11.h\n"
- "ld1sb { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z18.h\n"
- "add z23.h, z23.h, z11.h\n"
- "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z19.h\n"
- "add z24.h, z24.h, z11.h\n"
- "ld1sb { z8.s }, p1/Z, [x21]\n"
- "mov z25.d, z8.d\n"
- "add z25.h, z25.h, z11.h\n"
+ "trn1 z11.h, z11.h, z24.h\n"
"and x15, x15, #0x1\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"sub x13, x13, x23\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z30.h\n"
+ "add z10.h, z10.h, z29.h\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "add z11.h, z11.h, z29.h\n"
+ "mov z14.d, z26.d\n"
+ "add z12.h, z12.h, z29.h\n"
+ "add z13.h, z13.h, z29.h\n"
+ "add z14.h, z14.h, z29.h\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1sb { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1771549 // sdot za.s[x8, 1], { z10.h-z13.h }, z7.h\n"
+ "ld1sb { z3.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "add x20, x14, %x[ld_in_row]\n"
- "ld1sb { z18.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- "trn1 z21.h, z21.h, z18.h\n"
- "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "ld1sb { z9.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z21.h, z21.h, z11.h\n"
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
- "ld1sb { z8.s }, p1/Z, [x22]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f1569 // sdot za.s[x8, 1], { z11.h-z14.h }, z15.h\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z8.h\n"
- "add z22.h, z22.h, z11.h\n"
- "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z3.h, z9.h\n"
+ "ld1sb { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "ld1sb { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z27.h\n"
- "add z23.h, z23.h, z11.h\n"
- "ld1sb { z24.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a1ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z1.s\n"
+ "ld1sb { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- "ld1sb { z8.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1sb { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z8.h\n"
- "add z24.h, z24.h, z11.h\n"
- "ld1sb { z4.s }, p1/Z, [x22]\n"
- "mov z25.d, z4.d\n"
- "add z25.h, z25.h, z11.h\n"
- ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
- ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
- "ld1sb { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- "ld1sb { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z12.h\n"
- ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "trn1 z4.h, z4.h, z15.h\n"
+ "add z3.h, z3.h, z29.h\n"
+ "ld1sb { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z5.h, z5.h, z10.h\n"
+ "ld1sb { z21.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a0aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ "trn1 z6.h, z6.h, z14.h\n"
+ "add z4.h, z4.h, z29.h\n"
+ "mov z7.d, z21.d\n"
+ "add z5.h, z5.h, z29.h\n"
+ ".inst 0xc1a8ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add z6.h, z6.h, z29.h\n"
+ "add z7.h, z7.h, z29.h\n"
+ ".inst 0xc1bccff8 // sclamp { z24.s-z27.s }, z31.s, z28.s\n"
+ ".inst 0xc17a1468 // sdot za.s[x8, 0], { z3.h-z6.h }, z10.h\n"
+ "ld1sb { z10.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
"ld1sb { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z0.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc17b1488 // sdot za.s[x8, 0], { z4.h-z7.h }, z11.h\n"
+ "ld1sb { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z20.h\n"
- "st1b { z1.s }, p1, [x10]\n"
- "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "trn1 z10.h, z10.h, z22.h\n"
+ "ld1sb { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z2.s }, p1, [x27]\n"
- "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z24.h\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z3.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "ld1sb { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z3.h\n"
- "add z21.h, z21.h, z11.h\n"
- "ld1sb { z3.s }, p1/Z, [x20]\n"
- "mov z25.d, z3.d\n"
- "add z22.h, z22.h, z11.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- "add z23.h, z23.h, z11.h\n"
- "add z24.h, z24.h, z11.h\n"
- "add z25.h, z25.h, z11.h\n"
+ "trn1 z11.h, z11.h, z14.h\n"
+ "add z10.h, z10.h, z29.h\n"
+ "ld1sb { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "trn1 z12.h, z12.h, z9.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
+ "trn1 z13.h, z13.h, z6.h\n"
+ "add z11.h, z11.h, z29.h\n"
+ "mov z14.d, z20.d\n"
+ "add z12.h, z12.h, z29.h\n"
+ "add z13.h, z13.h, z29.h\n"
+ "add z14.h, z14.h, z29.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -417,440 +422,440 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z22.s }, p0/Z, [x14]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z20.h, z20.h, z22.h\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z4.s }, p0/Z, [x20]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z17.h\n"
- "trn1 z23.h, z23.h, z4.h\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z23.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z24.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z11.h\n"
- "addvl x20, SP, #4\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "trn1 z25.h, z25.h, z17.h\n"
- ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
- "mov z26.d, z1.d\n"
- ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
+ ".inst 0xc1731688 // sdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ "mov z24.d, z24.d\n"
+ ".inst 0xc17b16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z11.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z22.s }, p0/Z, [x14]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z5.s }, p0/Z, [x20]\n"
- "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z17.h\n"
- "trn1 z23.h, z23.h, z5.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z20.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z11.h\n"
- "addvl x20, SP, #2\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "trn1 z25.h, z25.h, z17.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "mov z26.d, z15.d\n"
- ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "trn1 z24.h, z24.h, z25.h\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
+ ".inst 0xc17316a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "mov z25.d, z20.d\n"
+ ".inst 0xc17b16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
- "mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z18.h\n"
- "trn1 z22.h, z22.h, z3.h\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- "add z20.h, p0/M, z20.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z23.h, z23.h, z19.h\n"
- "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z25.d, z3.d\n"
- "csel x22, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
- "16:" // Padded: Main loop
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "addvl x20, SP, #4\n"
- "mov x12, #0x0\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
- "ld1sb { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "ld1sb { z10.s }, p0/Z, [x14]\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z11.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
- "ld1sb { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z20.h\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z15.h, p0/M, z15.h, z11.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x21]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z20.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z24.s }, p0/Z, [x21]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z4.s }, p0/Z, [x21]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z14.h\n"
- "trn1 z22.h, z22.h, z15.h\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "addvl x20, SP, #2\n"
- "ld1sb { z2.s }, p0/Z, [x21]\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z4.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z21.h\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
+ "mov z14.d, z20.d\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "add z2.h, p0/M, z2.h, z11.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721549 // sdot za.s[x8, 1], { z10.h-z13.h }, z2.h\n"
+ "ld1sb { z10.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x20, x14, %x[ld_in_row]\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z25.d, z2.d\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1731569 // sdot za.s[x8, 1], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1sb { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z26.h, p0/M, z26.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ "ld1sb { z11.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z26.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1sb { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z5.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "ld1sb { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "st1b { z6.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z11.h, z11.h, z9.h\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z9.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add z12.h, p0/M, z12.h, z11.h\n"
+ "ld1sb { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z11.h\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
- "add x11, x11, x9\n"
- "trn1 z21.h, z21.h, z20.h\n"
- "st1b { z17.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "trn1 z22.h, z22.h, z4.h\n"
- "trn1 z23.h, z23.h, z27.h\n"
- "st1b { z18.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
- "trn1 z24.h, z24.h, z12.h\n"
- "mov z25.d, z8.d\n"
- "st1b { z19.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
- "add x14, x14, %x[ld_in_col]\n"
- "bgt 16b\n"
- "17:" // Main loop tail
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z9.s }, p0/Z, [x22]\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1sb { z0.s }, p0/Z, [x14]\n"
- "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "ld1sb { z10.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "mov z14.d, z9.d\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z11.h\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- "ld1sb { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z11.h\n"
+ "ld1sb { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z25.h\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z12.h, p0/M, z12.h, z11.h\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z2.s }, p0/Z, [x20]\n"
- "add z2.h, p0/M, z2.h, z11.h\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z15.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z4.h, p0/M, z4.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z3.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z4.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
+ "add z4.h, p0/M, z4.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "trn1 z13.h, z13.h, z4.h\n"
+ "add z26.h, p0/M, z26.h, z29.h\n"
+ "mov z14.d, z26.d\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "addvl x22, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- "trn1 z0.h, z0.h, z14.h\n"
- "add x8, x8, #0x1\n"
- "add z27.h, p0/M, z27.h, z11.h\n"
- "trn1 z1.h, z1.h, z12.h\n"
- "trn1 z2.h, z2.h, z21.h\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1721549 // sdot za.s[x8, 1], { z10.h-z13.h }, z2.h\n"
+ "ld1sb { z9.s }, p0/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z3.h, z3.h, z25.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- "mov z4.d, z27.d\n"
- ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1731569 // sdot za.s[x8, 1], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ "ld1sb { z10.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z15.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "st1b { z17.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- "st1b { z18.s }, p1, [x27]\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z19.s }, p1, [x26]\n"
+ "trn1 z10.h, z10.h, z15.h\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z5.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z5.h, p0/M, z5.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z5.s }, p0/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z15.h\n"
+ "add z5.h, p0/M, z5.h, z29.h\n"
+ ".inst 0xc1721528 // sdot za.s[x8, 0], { z9.h-z12.h }, z2.h\n"
+ "mov z13.d, z5.d\n"
+ ".inst 0xc1731548 // sdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #4\n"
+ "sub x13, x13, #0x1\n"
"ld1sb { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z15.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z0.s }, p0/Z, [x20]\n"
- "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z17.h\n"
- "trn1 z22.h, z22.h, z0.h\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z12.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z20.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z5.h, p0/M, z5.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z30.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z30.h, p0/M, z30.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z4.s }, p0/Z, [x20]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z5.h\n"
- "mov z25.d, z4.d\n"
- "addvl x20, SP, #4\n"
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z6.s }, p0/Z, [x21]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "add z6.h, p0/M, z6.h, z29.h\n"
+ ".inst 0xc17216a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "mov z25.d, z6.d\n"
+ ".inst 0xc17316c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17516a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z5.h\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc17d16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z13.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "st1b { z17.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z18.s }, p1, [x27]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z19.s }, p1, [x26]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
- "st1b { z0.s }, p1, [x11]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z1.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z2.s }, p1, [x27]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z3.s }, p1, [x26]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -869,6 +874,8 @@ void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #6\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index bb68733a45..857d20ab09 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,249 +70,254 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "ptrue p2.b\n"
+ "mov x22, SP\n"
"mov x20, #0x8\n"
+ "ptrue p2.b\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x5\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x22, #0x8\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x21, x21, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x20, x5\n"
+ "mov SP, x21\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-30\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "neg z15.h, p2/M, z15.h\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x6\n"
- "addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z17.h, p2/M, z17.h\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z18.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x23\n"
- "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z0.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z13.h, #0x0\n"
+ "addvl x22, SP, #30\n"
+ "addvl x22, x22, #-6\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z29.d, z28.d\n"
+ "mov x23, x24\n"
+ "incw x24\n"
+ "ld1sb { z22.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1sb { z21.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1sb { z19.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1sb { z25.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z0.h\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "ld1sb { z5.s }, p2/Z, [x23]\n"
+ "mov x20, x24\n"
+ "incw x24\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "sub z25.h, z25.h, z0.h\n"
+ "sub z5.h, z5.h, z0.h\n"
+ "trn1 z6.h, z13.h, z22.h\n"
+ "trn1 z23.h, z22.h, z21.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z15.h, #0x0\n"
- "sub z2.h, z2.h, z3.h\n"
- "incw x23\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "trn1 z4.h, z21.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z13.h, z13.h, z3.h\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z19.h, z25.h\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z27.h, z27.h, z3.h\n"
- "trn1 z0.h, z2.h, z13.h\n"
+ "trn1 z22.h, z25.h, z5.h\n"
+ "ld1sb { z7.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z25.h, z5.h, z13.h\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "sub z9.h, z9.h, z0.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "mov x20, x24\n"
+ "sub z18.h, z18.h, z0.h\n"
+ "st1h { z6.h }, p2, [x22]\n"
+ "incw x24\n"
+ "sub z7.h, z7.h, z0.h\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z1.h, z1.h, z0.h\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z20.h, z13.h, z27.h\n"
+ "trn1 z12.h, z27.h, z9.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z2.h, z9.h, z18.h\n"
"ld1sb { z19.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z19.h, z19.h, z3.h\n"
- "trn1 z26.h, z13.h, z27.h\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z9.h, z18.h, z7.h\n"
"ld1sb { z14.s }, p2/Z, [x20]\n"
- "sub z14.h, z14.h, z3.h\n"
- "mov x20, x23\n"
- "trn1 z10.h, z27.h, z19.h\n"
- "ld1sb { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z19.h, z19.h, z14.h\n"
- "trn1 z1.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z18.h, z7.h, z1.h\n"
"ld1sb { z5.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z9.h, z9.h, z3.h\n"
- "sub z5.h, z5.h, z3.h\n"
- "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "st1h { z25.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z25.h, z1.h, z13.h\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "addvl x22, x22, #-6\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "mov x20, x24\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "sub z5.h, z5.h, z0.h\n"
+ "st1h { z12.h }, p2, [x22, #1, MUL VL]\n"
+ "incw x24\n"
+ "st1h { z2.h }, p2, [x22, #2, MUL VL]\n"
+ "sub z16.h, z16.h, z0.h\n"
+ "trn1 z7.h, z13.h, z21.h\n"
+ "trn1 z20.h, z21.h, z19.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z29.h, z29.h, z3.h\n"
- "addvl x22, SP, #30\n"
+ "trn1 z17.h, z19.h, z14.h\n"
+ "st1h { z9.h }, p2, [x22, #3, MUL VL]\n"
"ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "incw x23\n"
- "sub z2.h, z2.h, z3.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
- "addvl x22, x22, #-6\n"
- "sub z23.h, z23.h, z3.h\n"
- "mov x20, x23\n"
- "st1h { z11.h }, p2, [x22]\n"
- "trn1 z20.h, z15.h, z9.h\n"
- "incw x23\n"
- "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z22.h, z9.h, z5.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z9.h, z5.h, z29.h\n"
+ "trn1 z12.h, z14.h, z5.h\n"
+ "st1h { z18.h }, p2, [x22, #4, MUL VL]\n"
"ld1sb { z21.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z26.h, z29.h, z2.h\n"
- "ld1sb { z0.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z28.h, z2.h, z23.h\n"
- "ld1sb { z19.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z2.h, z23.h, z15.h\n"
- "sub z25.h, z25.h, z3.h\n"
+ "st1h { z25.h }, p2, [x22, #5, MUL VL]\n"
"addvl x22, x22, #-6\n"
- "sub z21.h, z21.h, z3.h\n"
- "ld1sb { z6.s }, p2/Z, [x20]\n"
- "sub z0.h, z0.h, z3.h\n"
- "mov x20, x23\n"
- "sub z19.h, z19.h, z3.h\n"
- "sub z6.h, z6.h, z3.h\n"
- "st1h { z20.h }, p2, [x22]\n"
- "incw x23\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z11.h, z15.h, z25.h\n"
- "trn1 z10.h, z25.h, z21.h\n"
- "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "trn1 z5.h, z5.h, z16.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z14.h, z21.h, z0.h\n"
+ "trn1 z4.h, z16.h, z13.h\n"
+ "sub z6.h, z6.h, z0.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "mov x20, x24\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "st1h { z7.h }, p2, [x22]\n"
+ "sub z25.h, z25.h, z0.h\n"
+ "st1h { z20.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z1.h, z13.h, z6.h\n"
+ "trn1 z24.h, z6.h, z2.h\n"
"ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z21.h, z0.h, z19.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z19.h, z19.h, z6.h\n"
- "ld1sb { z29.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z13.h, z6.h, z15.h\n"
- "sub z5.h, z5.h, z3.h\n"
- "sub z23.h, z23.h, z3.h\n"
- "ld1sb { z1.s }, p2/Z, [x20]\n"
- "addvl x22, x22, #-6\n"
- "sub z27.h, z27.h, z3.h\n"
- "sub z29.h, z29.h, z3.h\n"
- "mov x20, x23\n"
- "st1h { z11.h }, p2, [x22]\n"
- "sub z1.h, z1.h, z3.h\n"
- "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z30.h, z15.h, z5.h\n"
- "trn1 z26.h, z5.h, z23.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z2.h, z21.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z22.h, z23.h, z27.h\n"
- "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "st1h { z12.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z20.h, z21.h, z25.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z28.h, z27.h, z29.h\n"
- "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "st1h { z5.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z17.h, z25.h, z19.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z27.h, z29.h, z1.h\n"
- "ld1sb { z9.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z2.h, z1.h, z15.h\n"
- "ld1sb { z14.s }, p2/Z, [x20]\n"
- "sub z11.h, z11.h, z3.h\n"
+ "st1h { z4.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z19.h, z19.h, z13.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z0.h\n"
"addvl x22, x22, #-6\n"
- "sub z5.h, z5.h, z3.h\n"
- "sub z8.h, z8.h, z3.h\n"
- "st1h { z30.h }, p2, [x22]\n"
- "sub z9.h, z9.h, z3.h\n"
- "sub z14.h, z14.h, z3.h\n"
- "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
- "mov z19.d, z18.d\n"
- "trn1 z22.h, z15.h, z11.h\n"
- "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z1.h, z11.h, z5.h\n"
- "trn1 z31.h, z5.h, z8.h\n"
- "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z8.h, z8.h, z9.h\n"
- "trn1 z21.h, z9.h, z14.h\n"
- "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "sub z6.h, z6.h, z0.h\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "st1h { z1.h }, p2, [x22]\n"
+ "sub z22.h, z22.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z16.h, z13.h, z23.h\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z7.h, z23.h, z6.h\n"
+ "trn1 z12.h, z6.h, z14.h\n"
+ "st1h { z19.h }, p2, [x22, #5, MUL VL]\n"
"addvl x22, x22, #-6\n"
- "trn1 z15.h, z14.h, z15.h\n"
- "st1h { z22.h }, p2, [x22]\n"
- "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
- "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
- "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z5.h, z14.h, z22.h\n"
+ "trn1 z14.h, z22.h, z27.h\n"
+ "trn1 z20.h, z27.h, z13.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "st1h { z7.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z12.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z5.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z14.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #5, MUL VL]\n"
"cbz x21, 3f\n"
- "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x25, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x5\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x6, x5\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x11, #0x0\n"
"mov x8, #0x8\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x6, x16\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x25, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x6, x16\n"
+ "orr x20, x7, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x16, x6, x20, x16\n"
- ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0046b80 // mova za.d[x11, #0], { z28.d-z29.d }\n"
"mov x22, #0x4\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x6, x21, x16\n"
+ ".inst 0xc0046b81 // mova za.d[x11, #1], { z28.d-z29.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046b82 // mova za.d[x11, #2], { z28.d-z29.d }\n"
"ldp x14, x13, [x23], #0x10\n"
- ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ ".inst 0xc0046b83 // mova za.d[x11, #3], { z28.d-z29.d }\n"
"ldp x4, x10, [x20], #0x10\n"
- ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ ".inst 0xc0046b84 // mova za.d[x11, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0046b85 // mova za.d[x11, #5], { z28.d-z29.d }\n"
"ldp x9, x28, [x23], #0x10\n"
- ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ ".inst 0xc0046b86 // mova za.d[x11, #6], { z28.d-z29.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
- ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc0046b87 // mova za.d[x11, #7], { z28.d-z29.d }\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066810 // mova { z16.d-z17.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
- ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
- ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
- ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ ".inst 0xc0066832 // mova { z18.d-z19.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc1abaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc1becff0 // sclamp { z16.s-z19.s }, z31.s, z30.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z20.s }, p1, [x14]\n"
+ "st1b { z16.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z22.s }, p1, [x13]\n"
+ "st1b { z18.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z21.s }, p1, [x9]\n"
+ "st1b { z17.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z23.s }, p1, [x28]\n"
+ "st1b { z19.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -328,331 +333,331 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1sb { z1.s }, p1/Z, [x16]\n"
+ "ld1sb { z4.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z28.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z1.h, z28.h\n"
- "add z27.h, z27.h, z17.h\n"
- "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z1.h, z2.h\n"
- "add z28.h, z28.h, z17.h\n"
- "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z6.s }, p1/Z, [x21]\n"
+ "trn1 z22.h, z4.h, z13.h\n"
+ "ld1sb { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z13.h, z6.h\n"
- "add z29.h, z29.h, z17.h\n"
- "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z25.h, z19.h\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ "add z22.h, z22.h, z15.h\n"
+ "trn1 z24.h, z14.h, z27.h\n"
"ld1sb { z20.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z30.h, z20.h\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z17.h\n"
- ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z23.h, z23.h, z15.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "trn1 z25.h, z21.h, z20.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16d76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16c76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z12.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc16e76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z14.h\n"
+ ".inst 0xc16676e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xc1617708 // sdot za.s[x11, 0], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xc1607709 // sdot za.s[x11, 1], { z24.h-z25.h }, z0.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "ld1sb { z21.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z28.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z20.h, z2.h, z28.h\n"
- "add z20.h, z20.h, z17.h\n"
- "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z11.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z21.h, z31.h, z11.h\n"
- "add z21.h, z21.h, z17.h\n"
- "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "ld1sb { z3.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z22.h, z25.h, z8.h\n"
- "add z22.h, z22.h, z17.h\n"
- "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
- "ld1sb { z3.s }, p1/Z, [x22]\n"
- "trn1 z23.h, z8.h, z3.h\n"
- ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
- "add z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z21.h, z18.h\n"
+ "ld1sb { z7.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z25.h, z17.h, z3.h\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add z24.h, z24.h, z15.h\n"
+ "trn1 z26.h, z27.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
- ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
- ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
- ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
- ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
- ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
- ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "trn1 z27.h, z17.h, z16.h\n"
+ "add z26.h, z26.h, z15.h\n"
+ ".inst 0xc1637708 // sdot za.s[x11, 0], { z24.h-z25.h }, z3.h\n"
+ ".inst 0xc1627709 // sdot za.s[x11, 1], { z24.h-z25.h }, z2.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ "add z27.h, z27.h, z15.h\n"
+ ".inst 0xc16d770a // sdot za.s[x11, 2], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc16c770b // sdot za.s[x11, 3], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1697728 // sdot za.s[x11, 0], { z25.h-z26.h }, z9.h\n"
+ ".inst 0xc1617729 // sdot za.s[x11, 1], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xc1677748 // sdot za.s[x11, 0], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc1667749 // sdot za.s[x11, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa0422a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc16c774b // sdot za.s[x11, 3], { z26.h-z27.h }, z12.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "ld1sb { z0.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1sb { z22.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z0.h, z2.h, z22.h\n"
- "add z0.h, z0.h, z17.h\n"
- "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "ld1sb { z19.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1sb { z6.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z1.h, z14.h, z6.h\n"
- "add z1.h, z1.h, z17.h\n"
- "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "ld1sb { z4.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1sb { z6.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z2.h, z15.h, z6.h\n"
- "add z2.h, z2.h, z17.h\n"
- "ld1sb { z21.s }, p1/Z, [x23]\n"
+ "ld1sb { z3.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
- "ld1sb { z30.s }, p1/Z, [x23]\n"
- "trn1 z3.h, z21.h, z30.h\n"
- ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
- ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
- "add z3.h, z3.h, z17.h\n"
- ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
- ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
- ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
- ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
- ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
- ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
- ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
- ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
- ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "ld1sb { z17.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z22.h, z0.h, z19.h\n"
+ "ld1sb { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z4.h, z3.h\n"
+ "ld1sb { z9.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ "add z22.h, z22.h, z15.h\n"
+ "trn1 z24.h, z17.h, z25.h\n"
+ "ld1sb { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ "add z23.h, z23.h, z15.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "trn1 z25.h, z9.h, z17.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16576c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc16576ca // sdot za.s[x11, 2], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476cb // sdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16776e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc16676e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16576cc // sdot za.s[x11, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476cd // sdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xc16776ea // sdot za.s[x11, 2], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc16676eb // sdot za.s[x11, 3], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617708 // sdot za.s[x11, 0], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xc1607709 // sdot za.s[x11, 1], { z24.h-z25.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16576ec // sdot za.s[x11, 4], { z23.h-z24.h }, z5.h\n"
+ ".inst 0xc16476ed // sdot za.s[x11, 5], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc167770a // sdot za.s[x11, 2], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc166770b // sdot za.s[x11, 3], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1422a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d770c // sdot za.s[x11, 4], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc165770d // sdot za.s[x11, 5], { z24.h-z25.h }, z5.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1sb { z0.s }, p1/Z, [x16]\n"
+ "ld1sb { z16.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1sb { z3.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z0.h, z3.h\n"
- "add z28.h, z28.h, z17.h\n"
- "ld1sb { z6.s }, p1/Z, [x24]\n"
+ "ld1sb { z22.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1sb { z30.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z6.h, z30.h\n"
- "add z29.h, z29.h, z17.h\n"
- "ld1sb { z1.s }, p1/Z, [x24]\n"
+ "ld1sb { z19.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
"ld1sb { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z30.h, z1.h, z25.h\n"
- "add z30.h, z30.h, z17.h\n"
- "ld1sb { z3.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "ld1sb { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z18.h, z16.h, z22.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z5.s }, p1/Z, [x24]\n"
- "trn1 z31.h, z3.h, z5.h\n"
- ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
- "add z31.h, z31.h, z17.h\n"
- ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
- ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
- ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
- ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
- ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
- ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
- ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
- ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
- ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
- ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
- ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
- ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
- ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
- ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
- ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
- ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "ld1sb { z4.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z19.h, z19.h, z25.h\n"
+ "ld1sb { z27.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ "add z18.h, z18.h, z15.h\n"
+ "trn1 z20.h, z6.h, z4.h\n"
+ "ld1sb { z22.s }, p1/Z, [x24]\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "add z19.h, z19.h, z15.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "trn1 z21.h, z27.h, z22.h\n"
+ "add z20.h, z20.h, z15.h\n"
+ ".inst 0xc1697648 // sdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ ".inst 0xc1617649 // sdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22]\n"
+ "add z21.h, z21.h, z15.h\n"
+ ".inst 0xc16c764a // sdot za.s[x11, 2], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xc164764b // sdot za.s[x11, 3], { z18.h-z19.h }, z4.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d7668 // sdot za.s[x11, 0], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1657669 // sdot za.s[x11, 1], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xa1412ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163764c // sdot za.s[x11, 4], { z18.h-z19.h }, z3.h\n"
+ ".inst 0xc162764d // sdot za.s[x11, 5], { z18.h-z19.h }, z2.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16c766a // sdot za.s[x11, 2], { z19.h-z20.h }, z12.h\n"
+ ".inst 0xc164766b // sdot za.s[x11, 3], { z19.h-z20.h }, z4.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169764e // sdot za.s[x11, 6], { z18.h-z19.h }, z9.h\n"
+ ".inst 0xc161764f // sdot za.s[x11, 7], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xc163766c // sdot za.s[x11, 4], { z19.h-z20.h }, z3.h\n"
+ ".inst 0xc162766d // sdot za.s[x11, 5], { z19.h-z20.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16c768a // sdot za.s[x11, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc164768b // sdot za.s[x11, 3], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xa1422aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169766e // sdot za.s[x11, 6], { z19.h-z20.h }, z9.h\n"
+ ".inst 0xc161766f // sdot za.s[x11, 7], { z19.h-z20.h }, z1.h\n"
+ ".inst 0xc16c768c // sdot za.s[x11, 4], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc164768d // sdot za.s[x11, 5], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xa0422a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d768e // sdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc16c768f // sdot za.s[x11, 7], { z20.h-z21.h }, z12.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z26.s }, p1/Z, [x16]\n"
+ "ld1sb { z6.s }, p1/Z, [x16]\n"
"sub x25, x25, #0x1\n"
- "ld1sb { z28.s }, p1/Z, [x20]\n"
+ "ld1sb { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x25, x15\n"
- "add z25.h, z25.h, z17.h\n"
- "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z31.h, z15.h\n"
"csel x25, x25, x15, LT\n"
- "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z26.h, z26.h, z17.h\n"
+ "trn1 z24.h, z6.h, z13.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1sb { z8.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z22.h, z8.h\n"
- "add z27.h, z27.h, z17.h\n"
- "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "ld1sb { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z21.h, z19.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z20.h, z13.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ "trn1 z27.h, z22.h, z16.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ "add z26.h, z26.h, z15.h\n"
+ "add z27.h, z27.h, z15.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
"addvl x23, SP, #12\n"
- "ld1sb { z21.s }, p1/Z, [x16]\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "ld1sb { z23.s }, p1/Z, [x16]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1sb { z0.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
- ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
- "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc169770a // sdot za.s[x11, 2], { z24.h-z25.h }, z9.h\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
- "ld1sb { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc161770b // sdot za.s[x11, 3], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
"ld1sb { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
- "ld1sb { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16c772a // sdot za.s[x11, 2], { z25.h-z26.h }, z12.h\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc164772b // sdot za.s[x11, 3], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
- "ld1sb { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16d770e // sdot za.s[x11, 6], { z24.h-z25.h }, z13.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc165770f // sdot za.s[x11, 7], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
- ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
- "trn1 z25.h, z21.h, z0.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
- "add z25.h, z25.h, z17.h\n"
- ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
- "trn1 z26.h, z20.h, z31.h\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
- "add z26.h, z26.h, z17.h\n"
- ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
- "trn1 z27.h, z29.h, z22.h\n"
- "trn1 z28.h, z30.h, z6.h\n"
+ ".inst 0xc161774c // sdot za.s[x11, 4], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc160774d // sdot za.s[x11, 5], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa0422ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc167774e // sdot za.s[x11, 6], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1422aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16c1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1641709 // sdot za.s[x8, 1], { z24.h-z25.h }, z4.h\n"
+ "trn1 z24.h, z23.h, z19.h\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1651729 // sdot za.s[x8, 1], { z25.h-z26.h }, z5.h\n"
+ "trn1 z25.h, z21.h, z20.h\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ "trn1 z26.h, z22.h, z18.h\n"
+ "trn1 z27.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z27.h, z27.h, z17.h\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "add z26.h, z26.h, z15.h\n"
+ "add z27.h, z27.h, z15.h\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "add z28.h, z28.h, z17.h\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
@@ -667,513 +672,513 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z9.s }, p0/Z, [x16]\n"
- "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1sb { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
+ "ld1sb { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z9.h, z22.h\n"
- "trn1 z0.h, z21.h, z20.h\n"
+ "add z26.h, p0/M, z26.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z26.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16e76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z14.h\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc16676c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z6.h\n"
+ "add z25.h, p0/M, z25.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1sb { z1.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z25.h\n"
+ "add z1.h, p0/M, z1.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z1.h, z22.h, z20.h\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
- ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z2.h, z21.h, z20.h\n"
- ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
- ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
- ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ ".inst 0xc16d76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z13.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc16c76e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ "trn1 z25.h, z1.h, z16.h\n"
+ ".inst 0xc1637708 // sdot za.s[x11, 0], { z24.h-z25.h }, z3.h\n"
+ ".inst 0xc1627709 // sdot za.s[x11, 1], { z24.h-z25.h }, z2.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z5.s }, p0/Z, [x16]\n"
- "add z5.h, p0/M, z5.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "ld1sb { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z5.h, z22.h\n"
- "trn1 z29.h, z21.h, z20.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z1.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z14.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1667409 // sdot za.s[x11, 1], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #18\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e740a // sdot za.s[x11, 2], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc166740b // sdot za.s[x11, 3], { z0.h-z1.h }, z6.h\n"
+ "ld1sb { z0.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z2.h, z18.h, z17.h\n"
+ "add z0.h, p0/M, z0.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "trn1 z30.h, z22.h, z20.h\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #24\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
- ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "trn1 z31.h, z21.h, z20.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
- ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
- ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
- ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
- ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
- ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
- ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
- ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16c7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z12.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc1647429 // sdot za.s[x11, 1], { z1.h-z2.h }, z4.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
+ ".inst 0xc16e742a // sdot za.s[x11, 2], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc166742b // sdot za.s[x11, 3], { z1.h-z2.h }, z6.h\n"
+ "trn1 z3.h, z0.h, z17.h\n"
+ ".inst 0xc16d7448 // sdot za.s[x11, 0], { z2.h-z3.h }, z13.h\n"
+ ".inst 0xc1657449 // sdot za.s[x11, 1], { z2.h-z3.h }, z5.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e744a // sdot za.s[x11, 2], { z2.h-z3.h }, z14.h\n"
+ ".inst 0xc166744b // sdot za.s[x11, 3], { z2.h-z3.h }, z6.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z29.s }, p0/Z, [x16]\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ "ld1sb { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z8.h, z29.h, z22.h\n"
- "trn1 z9.h, z21.h, z20.h\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16376c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z3.h\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc16276c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z2.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "addvl x22, SP, #12\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cb // sdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z17.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ ".inst 0xc16976cc // sdot za.s[x11, 4], { z22.h-z23.h }, z9.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- "trn1 z10.h, z22.h, z20.h\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #18\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
- ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- "trn1 z11.h, z21.h, z20.h\n"
- ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
- ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
- ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
- ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
- ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
- ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
- ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
- ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
- ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
- ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc16176cd // sdot za.s[x11, 5], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16c76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z12.h\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xc16476e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0xc16e76ea // sdot za.s[x11, 2], { z23.h-z24.h }, z14.h\n"
+ ".inst 0xc16676eb // sdot za.s[x11, 3], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ ".inst 0xc16976ec // sdot za.s[x11, 4], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176ed // sdot za.s[x11, 5], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16d7708 // sdot za.s[x11, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1657709 // sdot za.s[x11, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa0422aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc165770a // sdot za.s[x11, 2], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc164770b // sdot za.s[x11, 3], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x24, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z1.s }, p0/Z, [x16]\n"
- "add z1.h, p0/M, z1.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z21.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z1.h, z22.h\n"
- "trn1 z27.h, z21.h, z20.h\n"
+ "ld1sb { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z22.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e76a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z14.h\n"
+ "ld1sb { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16676a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "addvl x23, SP, #6\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16776aa // sdot za.s[x11, 2], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16676ab // sdot za.s[x11, 3], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z17.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z23.h, z18.h, z16.h\n"
+ ".inst 0xc16776ac // sdot za.s[x11, 4], { z21.h-z22.h }, z7.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
- "trn1 z28.h, z22.h, z20.h\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
- "addvl x22, SP, #12\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
- ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #18\n"
- "trn1 z29.h, z21.h, z20.h\n"
- ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
- "addvl x20, SP, #24\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
- ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc16676ad // sdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16576c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z5.h\n"
+ "ld1sb { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc16476c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16d76ae // sdot za.s[x11, 6], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc16c76af // sdot za.s[x11, 7], { z21.h-z22.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0xc16e76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cb // sdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
- ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
- ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
- ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
- ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
- ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
- ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
- ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16e76cc // sdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cd // sdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16776ce // sdot za.s[x11, 6], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xc16676cf // sdot za.s[x11, 7], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc16176ea // sdot za.s[x11, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16076eb // sdot za.s[x11, 3], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16976ec // sdot za.s[x11, 4], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176ed // sdot za.s[x11, 5], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1422a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16c76ee // sdot za.s[x11, 6], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc16476ef // sdot za.s[x11, 7], { z23.h-z24.h }, z4.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 22f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z6.s }, p0/Z, [x16]\n"
- "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "ld1sb { z18.s }, p0/Z, [x16]\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z30.s }, p0/Z, [x20]\n"
- "add z30.h, p0/M, z30.h, z17.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z17.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z6.h, z30.h\n"
- "trn1 z26.h, z27.h, z26.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z17.h\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z9.s }, p0/Z, [x20]\n"
- "add z9.h, p0/M, z9.h, z17.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- "sub x25, x25, #0x1\n"
- "sub x15, x15, #0x1\n"
- "cmp x25, x15\n"
- "trn1 z27.h, z8.h, z9.h\n"
- "trn1 z28.h, z21.h, z29.h\n"
- "csel x25, x25, x15, LT\n"
- "add x16, x16, %x[ld_in_col]\n"
- "sub x15, x15, x25\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ "trn1 z27.h, z17.h, z16.h\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z8.s }, p0/Z, [x16]\n"
- "add z8.h, p0/M, z8.h, z17.h\n"
- "add x24, x16, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
- "addvl x23, SP, #6\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- "addvl x22, SP, #12\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
- ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24]\n"
+ "addvl x23, SP, #12\n"
+ "add x22, x16, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1sb { z29.s }, p0/Z, [x24]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- "mov x12, #0x4\n"
- "add x24, x24, %x[ld_in_row]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1sb { z16.s }, p0/Z, [x16]\n"
+ ".inst 0xc16d770a // sdot za.s[x11, 2], { z24.h-z25.h }, z13.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc165770b // sdot za.s[x11, 3], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc169772a // sdot za.s[x11, 2], { z25.h-z26.h }, z9.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc161772b // sdot za.s[x11, 3], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ "add z19.h, p0/M, z19.h, z15.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc165770e // sdot za.s[x11, 6], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc164770f // sdot za.s[x11, 7], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
- "ld1sb { z30.s }, p0/Z, [x24]\n"
- "add z30.h, p0/M, z30.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "add z23.h, p0/M, z23.h, z15.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16d772e // sdot za.s[x11, 6], { z25.h-z26.h }, z13.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc165772f // sdot za.s[x11, 7], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc167774c // sdot za.s[x11, 4], { z26.h-z27.h }, z7.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa0422aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "subs x25, x25, #0x1\n"
- ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
- "ld1sb { z15.s }, p0/Z, [x24]\n"
- "add z15.h, p0/M, z15.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163774e // sdot za.s[x11, 6], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774f // sdot za.s[x11, 7], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16c1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z12.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
- "ld1sb { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1641709 // sdot za.s[x8, 1], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ "trn1 z24.h, z16.h, z19.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16d1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1651729 // sdot za.s[x8, 1], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z25.h, z23.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
- "ld1sb { z31.s }, p0/Z, [x24]\n"
- "add z31.h, p0/M, z31.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
- "ld1sb { z22.s }, p0/Z, [x24]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
- ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
- ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- "trn1 z25.h, z8.h, z21.h\n"
- ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z26.h, z29.h, z30.h\n"
- ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc1631748 // sdot za.s[x8, 0], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc1621749 // sdot za.s[x8, 1], { z26.h-z27.h }, z2.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x2\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z27.h, z15.h, z20.h\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- "trn1 z28.h, z31.h, z22.h\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "trn1 z26.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ "trn1 z27.h, z17.h, z18.h\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
- ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xc16e770a // sdot za.s[x11, 2], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770b // sdot za.s[x11, 3], { z24.h-z25.h }, z6.h\n"
".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d772a // sdot za.s[x11, 2], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc165772b // sdot za.s[x11, 3], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16c770e // sdot za.s[x11, 6], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc164770f // sdot za.s[x11, 7], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
- ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
- ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
- ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16c774c // sdot za.s[x11, 4], { z26.h-z27.h }, z12.h\n"
+ ".inst 0xc164774d // sdot za.s[x11, 5], { z26.h-z27.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc161774e // sdot za.s[x11, 6], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc160774f // sdot za.s[x11, 7], { z26.h-z27.h }, z0.h\n"
".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
- ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
- ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
- ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xc16d1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1651709 // sdot za.s[x8, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc16e1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1661729 // sdot za.s[x8, 1], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xc1611748 // sdot za.s[x8, 0], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc1601749 // sdot za.s[x8, 1], { z26.h-z27.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066818 // mova { z24.d-z25.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc006683a // mova { z26.d-z27.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc1abaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1becff8 // sclamp { z24.s-z27.s }, z31.s, z30.s\n"
+ "st1b { z24.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z26.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z25.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z27.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
"incw x20, ALL, MUL #16\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1192,6 +1197,8 @@ void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 3da0d14d74..94aa79c747 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,194 +69,199 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0xb\n"
"ptrue p2.b\n"
- "mov x20, #0xb\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x3\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x22, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x5\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x4\n"
+ "sub x21, x21, x4\n"
+ "mov SP, x20\n"
+ "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
"addvl SP, SP, #-15\n"
- "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z7.h, p2/M, z7.h\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "whilelt p1.s, XZR, x6\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z18.h, p2/M, z18.h\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "whilelt p8.s, XZR, x5\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z12.s, #0x0\n"
+ "mov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x7, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z0.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z12.h, #0x0\n"
+ "addvl x22, SP, #15\n"
+ "addvl x22, x22, #-3\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z21.d, z20.d\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z13.h, z13.h, z28.h\n"
- "incw x22\n"
- "mov z26.h, #0x0\n"
- "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z22.h, z22.h, z28.h\n"
- "trn1 z17.h, z13.h, z22.h\n"
- "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z20.h, z20.h, z28.h\n"
- "addvl x21, SP, #15\n"
- "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z24.h, z24.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "sub z1.h, z1.h, z28.h\n"
- "trn1 z29.h, z20.h, z1.h\n"
+ "sub z30.h, z30.h, z0.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z8.h, z8.h, z0.h\n"
+ "sub z17.h, z17.h, z0.h\n"
+ "sub z26.h, z26.h, z0.h\n"
+ "trn1 z16.h, z24.h, z30.h\n"
"ld1sb { z27.s }, p2/Z, [x20]\n"
- "mov x20, x22\n"
- "sub z27.h, z27.h, z28.h\n"
- "incw x22\n"
- "ld1sb { z14.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "sub z14.h, z14.h, z28.h\n"
- "addvl x21, x21, #-3\n"
- "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z18.h, z18.h, z28.h\n"
- "trn1 z22.h, z27.h, z26.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z23.h, z23.h, z28.h\n"
- "st1h { z17.h }, p2, [x21]\n"
- "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "trn1 z15.h, z8.h, z17.h\n"
+ "ld1sb { z31.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z30.h, z30.h, z28.h\n"
- "trn1 z8.h, z14.h, z18.h\n"
- "ld1sb { z15.s }, p2/Z, [x20]\n"
- "mov x20, x22\n"
- "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
- "sub z15.h, z15.h, z28.h\n"
- "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z23.h, z23.h, z30.h\n"
- "sub z20.h, z20.h, z28.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z28.h\n"
- "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z26.h, z12.h\n"
+ "sub z11.h, z11.h, z0.h\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "sub z31.h, z31.h, z0.h\n"
+ "incw x23\n"
+ "sub z9.h, z9.h, z0.h\n"
+ "st1h { z15.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z10.h, z10.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z11.h, z27.h, z11.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z15.h, z26.h\n"
- "incw x22\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z28.h\n"
- "sub z13.h, z13.h, z28.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
- "addvl x21, x21, #-3\n"
- "mov x20, x22\n"
- "st1h { z8.h }, p2, [x21]\n"
- "trn1 z27.h, z20.h, z24.h\n"
- "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "trn1 z13.h, z31.h, z9.h\n"
+ "ld1sb { z28.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z11.h, z11.h, z28.h\n"
- "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z20.h, z16.h, z13.h\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "trn1 z8.h, z10.h, z12.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z28.h, z28.h, z0.h\n"
+ "incw x23\n"
+ "sub z26.h, z26.h, z0.h\n"
+ "st1h { z13.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "st1h { z8.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z13.h, z16.h, z2.h\n"
+ "ld1sb { z31.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z3.h, z3.h, z28.h\n"
- "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z29.h, z11.h, z26.h\n"
+ "trn1 z30.h, z28.h, z26.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
- "incw x22\n"
- "sub z13.h, z13.h, z28.h\n"
- "sub z15.h, z15.h, z28.h\n"
- "addvl x21, x21, #-3\n"
- "mov x20, x22\n"
- "st1h { z27.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z28.h\n"
- "trn1 z19.h, z22.h, z3.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
- "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z31.h, z31.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z31.h, z13.h, z15.h\n"
- "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
- "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "trn1 z17.h, z14.h, z12.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z13.h }, p2, [x22]\n"
+ "sub z16.h, z16.h, z0.h\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "st1h { z30.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z4.h, z4.h, z0.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z31.h, z31.h, z2.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z16.h, z16.h, z26.h\n"
- "sub z17.h, z17.h, z28.h\n"
- "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z0.h, z0.h, z28.h\n"
- "sub z18.h, z18.h, z28.h\n"
- "ld1sb { z1.s }, p2/Z, [x20]\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z1.h, z1.h, z28.h\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "addvl x21, x21, #-3\n"
- "st1h { z19.h }, p2, [x21]\n"
- "mov z13.d, z12.d\n"
- "mov z14.d, z12.d\n"
- "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
- "mov z15.d, z12.d\n"
- "trn1 z8.h, z17.h, z0.h\n"
- "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
- "addvl x21, x21, #-3\n"
- "trn1 z31.h, z18.h, z22.h\n"
- "trn1 z29.h, z1.h, z26.h\n"
- "st1h { z8.h }, p2, [x21]\n"
- "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "trn1 z24.h, z16.h, z27.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z4.h, z4.h, z12.h\n"
+ "sub z29.h, z29.h, z0.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z10.h, z10.h, z0.h\n"
+ "st1h { z31.h }, p2, [x22]\n"
+ "sub z13.h, z13.h, z0.h\n"
+ "sub z8.h, z8.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z11.h, z11.h, z0.h\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z14.h, z29.h, z10.h\n"
+ "trn1 z10.h, z13.h, z8.h\n"
+ "trn1 z4.h, z11.h, z12.h\n"
+ "st1h { z14.h }, p2, [x22]\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x21, x7, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x20, x7, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x7, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x5, x23, LSL #22\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
"mov x22, #0xb\n"
- "add x21, x4, x3\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x5, x4\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x4, x17\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x17, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x5, x16\n"
+ "orr x20, x6, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x17, x4, x20, x17\n"
- ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x23], #0x10\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
- "ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x5, x21, x16\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "ldp x10, x9, [x23], #0x10\n"
- "ldp x28, x27, [x20], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0040e83 // mova za.d[x8, #3], { z20.d-z23.d }\n"
+ "ldp x11, x10, [x20], #0x10\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
@@ -264,379 +269,379 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- "sub x16, x16, x21\n"
- ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z29.s }, p1, [x14]\n"
+ "st1b { z28.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z30.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z31.s }, p1, [x9]\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
"add x9, x9, x27\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x4, x3\n"
+ "adds XZR, x5, x4\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x7, x7, x22\n"
+ "sub x17, x17, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
"cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z27.s }, p1/Z, [x17]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x16]\n"
"addvl x20, SP, #12\n"
- "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z0.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z11.s }, p1/Z, [x21]\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z28.h, z11.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "ld1sb { z10.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "trn1 z8.h, z8.h, z26.h\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z8.h\n"
- "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z31.h\n"
"ld1sb { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z17.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1sb { z31.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "trn1 z10.h, z10.h, z16.h\n"
+ "add z8.h, z8.h, z18.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z26.h\n"
- "add z31.h, z31.h, z7.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
- "ld1sb { z20.s }, p1/Z, [x21]\n"
- "mov z0.d, z20.d\n"
- "add z0.h, z0.h, z7.h\n"
- ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "trn1 z11.h, z11.h, z30.h\n"
+ "add z9.h, z9.h, z18.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add z10.h, z10.h, z18.h\n"
+ "trn1 z12.h, z12.h, z28.h\n"
+ "ld1h { z4.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "mov z13.d, z2.d\n"
+ "add z12.h, z12.h, z18.h\n"
+ ".inst 0xc1701508 // sdot za.s[x8, 0], { z8.h-z11.h }, z0.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xc1711528 // sdot za.s[x8, 0], { z9.h-z12.h }, z1.h\n"
+ ".inst 0xc1741548 // sdot za.s[x8, 0], { z10.h-z13.h }, z4.h\n"
"9:" // Unpadded: 3 priming loads
- "add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p1/Z, [x16]\n"
"addvl x20, SP, #9\n"
- "ld1sb { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z17.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z0.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z0.h, z0.h, z16.h\n"
- "add z0.h, z0.h, z7.h\n"
- "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z2.h\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z16.h\n"
- "add z1.h, z1.h, z7.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
- "ld1sb { z16.s }, p1/Z, [x21]\n"
- "mov z2.d, z16.d\n"
- "add z2.h, z2.h, z7.h\n"
- ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "trn1 z14.h, z14.h, z24.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add z14.h, z14.h, z18.h\n"
+ "trn1 z16.h, z16.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add z15.h, z15.h, z18.h\n"
+ "mov z17.d, z17.d\n"
+ "add z16.h, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "add z17.h, z17.h, z18.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17015c8 // sdot za.s[x8, 0], { z14.h-z17.h }, z0.h\n"
"10:" // Unpadded: 2 priming loads
- "add x22, x17, %x[ld_in_row]\n"
- "ld1sb { z26.s }, p1/Z, [x17]\n"
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x16]\n"
"addvl x21, SP, #6\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z16.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1sb { z27.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "ld1sb { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "ld1sb { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z28.h, z29.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "ld1sb { z14.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z19.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "trn1 z12.h, z12.h, z26.h\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z23.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z30.h, z23.h\n"
+ "ld1sb { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z30.h, z30.h, z7.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
- "ld1sb { z22.s }, p1/Z, [x22]\n"
- "mov z31.d, z22.d\n"
- ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
- ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "trn1 z13.h, z13.h, z24.h\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add z13.h, z13.h, z18.h\n"
+ "trn1 z15.h, z15.h, z24.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z14.h, z14.h, z18.h\n"
+ "mov z16.d, z16.d\n"
+ "add z15.h, z15.h, z18.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "add z16.h, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17115a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z1.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ ".inst 0xc1781589 // sdot za.s[x8, 1], { z12.h-z15.h }, z8.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
- "add x22, x17, %x[ld_in_row]\n"
- "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1sb { z9.s }, p1/Z, [x16]\n"
"addvl x21, SP, #3\n"
- "ld1sb { z22.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z22.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "ld1sb { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z25.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1sb { z31.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1sb { z0.s }, p1/Z, [x22]\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z4.h\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z16.h\n"
"ld1sb { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z0.h, z0.h, z16.h\n"
- "add z0.h, z0.h, z7.h\n"
- "ld1sb { z1.s }, p1/Z, [x22]\n"
+ "ld1sb { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1sb { z2.s }, p1/Z, [x22]\n"
- "trn1 z1.h, z1.h, z2.h\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "add z9.h, z9.h, z18.h\n"
+ "ld1sb { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z1.h, z1.h, z7.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
- "ld1sb { z24.s }, p1/Z, [x22]\n"
- "mov z2.d, z24.d\n"
- ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
- ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
- "add z2.h, z2.h, z7.h\n"
- "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
- ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
- "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z10.h, z10.h, z18.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "trn1 z13.h, z13.h, z17.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z12.h, z12.h, z18.h\n"
+ "mov z14.d, z16.d\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xc1701528 // sdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add z14.h, z14.h, z18.h\n"
+ ".inst 0xc1711548 // sdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701529 // sdot za.s[x8, 1], { z9.h-z12.h }, z0.h\n"
+ ".inst 0xc1741568 // sdot za.s[x8, 0], { z11.h-z14.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1711549 // sdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
"12:" // Unpadded: 0 priming loads
- "cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "cmp x17, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
- "add x21, x17, %x[ld_in_row]\n"
- "ld1sb { z23.s }, p1/Z, [x17]\n"
- "sub x7, x7, #0x2\n"
- "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z11.s }, p1/Z, [x16]\n"
+ "sub x17, x17, #0x2\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z25.h\n"
- "sub x16, x16, #0x1\n"
- "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "lsr x20, x7, #0x1\n"
- "add z23.h, z23.h, z7.h\n"
- "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "lsr x20, x17, #0x1\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z30.h\n"
- "cmp x20, x16\n"
- "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "cmp x20, x15\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "csel x26, x20, x16, LT\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "csel x25, x20, x15, LT\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z22.h\n"
- "add z25.h, z25.h, z7.h\n"
- "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "and x17, x17, #0x1\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z22.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1sb { z27.s }, p1/Z, [x21]\n"
+ "sub x15, x15, x25\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "and x7, x7, #0x1\n"
- "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z4.h\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z30.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1sb { z28.s }, p1/Z, [x21]\n"
- "mov z28.d, z28.d\n"
- "add z28.h, z28.h, z7.h\n"
- "sub x16, x16, x26\n"
- "cbz x26, 21f\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ "mov z16.d, z30.d\n"
+ "add z14.h, z14.h, z18.h\n"
+ "add z15.h, z15.h, z18.h\n"
+ "add z16.h, z16.h, z18.h\n"
+ "cbz x25, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
- "addvl x25, SP, #6\n"
- "addvl x24, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
- "add x23, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "add x23, x16, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
- "subs x26, x26, #0x1\n"
- ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
- ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
- "ld1sb { z23.s }, p1/Z, [x17]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
- "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ "ld1sb { z28.s }, p1/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z16.h\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z23.h, z23.h, z7.h\n"
- "ld1sb { z24.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a3ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z3.s\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ "ld1sb { z29.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
- "ld1sb { z18.s }, p1/Z, [x23]\n"
+ "ld1sb { z9.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1sb { z25.s }, p1/Z, [x23]\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "ld1sb { z30.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "ld1sb { z8.s }, p1/Z, [x23]\n"
+ "trn1 z29.h, z29.h, z9.h\n"
+ "ld1sb { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z8.h\n"
- "add z25.h, z25.h, z7.h\n"
- "ld1sb { z26.s }, p1/Z, [x23]\n"
+ "add z28.h, z28.h, z18.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1sb { z31.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "ld1sb { z28.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a7ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1sb { z13.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z28.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1sb { z27.s }, p1/Z, [x23]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ "ld1sb { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
- "ld1sb { z28.s }, p1/Z, [x23]\n"
- "trn1 z27.h, z27.h, z28.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z29.h, z29.h, z18.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z27.h, z27.h, z7.h\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- "ld1sb { z20.s }, p1/Z, [x23]\n"
- "mov z28.d, z20.d\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "ld1sb { z23.s }, p1/Z, [x17]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "st1b { z17.s }, p1, [x14]\n"
+ "trn1 z31.h, z31.h, z13.h\n"
+ "ld1sb { z8.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ce78 // sclamp { z24.s-z27.s }, z19.s, z6.s\n"
+ "ld1h { z12.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z30.h, z30.h, z18.h\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "mov z1.d, z8.d\n"
+ "add z31.h, z31.h, z18.h\n"
+ "st1b { z24.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "st1b { z25.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "add z0.h, z0.h, z18.h\n"
+ "st1b { z26.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z1.h, z1.h, z18.h\n"
+ "st1b { z27.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ ".inst 0xc17a1788 // sdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xc17b17a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1781789 // sdot za.s[x8, 1], { z28.h-z31.h }, z8.h\n"
+ "ld1sb { z11.s }, p1/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17c17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z12.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z16.h\n"
- "st1b { z18.s }, p1, [x10]\n"
- "ld1sb { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc17917a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add x10, x10, x28\n"
- "st1b { z19.s }, p1, [x9]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z16.h\n"
- "add x9, x9, x27\n"
- "ld1sb { z25.s }, p1/Z, [x20]\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc17417c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z4.h\n"
+ "ld1sb { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "add z23.h, z23.h, z7.h\n"
+ "trn1 z12.h, z12.h, z9.h\n"
"ld1sb { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z16.h\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1sb { z26.s }, p1/Z, [x20]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1sb { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z25.h, z25.h, z7.h\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "ld1sb { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z16.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1sb { z27.s }, p1/Z, [x20]\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "add z27.h, z27.h, z7.h\n"
"ld1sb { z16.s }, p1/Z, [x20]\n"
- "mov z28.d, z16.d\n"
- "add z28.h, z28.h, z7.h\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "trn1 z14.h, z14.h, z1.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ "mov z16.d, z16.d\n"
+ "add z14.h, z14.h, z18.h\n"
+ "add z15.h, z15.h, z18.h\n"
+ "add z16.h, z16.h, z18.h\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x7, x7, x22\n"
+ "sub x17, x17, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -644,686 +649,686 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x17]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x21, x17, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z28.s }, p0/Z, [x21]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z17.h\n"
- "trn1 z28.h, z28.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z29.s }, p0/Z, [x21]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z30.s }, p0/Z, [x21]\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z31.s }, p0/Z, [x21]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x20, SP, #12\n"
+ ".inst 0xc1711568 // sdot za.s[x8, 0], { z11.h-z14.h }, z1.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z18.h\n"
- "trn1 z30.h, z30.h, z17.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
- "ld1sb { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
- "mov z0.d, z20.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
- "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z24.s }, p0/Z, [x17]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x21, x17, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z25.s }, p0/Z, [x21]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z17.h\n"
- "trn1 z25.h, z25.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z26.s }, p0/Z, [x21]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z27.s }, p0/Z, [x21]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z28.s }, p0/Z, [x21]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x20, SP, #9\n"
+ ".inst 0xc1711568 // sdot za.s[x8, 0], { z11.h-z14.h }, z1.h\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z18.h\n"
- "trn1 z27.h, z27.h, z17.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- "trn1 z28.h, z28.h, z16.h\n"
- ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
- "ld1sb { z11.s }, p0/Z, [x21]\n"
- "add z11.h, p0/M, z11.h, z7.h\n"
- "mov z29.d, z11.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z25.s }, p0/Z, [x17]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #12\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z17.h\n"
- "trn1 z26.h, z26.h, z16.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x21, SP, #6\n"
- "trn1 z27.h, z27.h, z18.h\n"
- "trn1 z28.h, z28.h, z17.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z29.h, z29.h, z16.h\n"
- ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
- "ld1sb { z1.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #12\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
- "mov z30.d, z1.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z25.s }, p0/Z, [x17]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #9\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z17.h\n"
- "trn1 z26.h, z26.h, z16.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x21, SP, #3\n"
- "trn1 z27.h, z27.h, z18.h\n"
- "trn1 z28.h, z28.h, z17.h\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z29.h, z29.h, z16.h\n"
- ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
- "ld1sb { z0.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #9\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
- ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
- "mov z30.d, z0.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"19:" // Padded: 0 priming loads
- "cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "cmp x17, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "sub x17, x17, #0x2\n"
+ "sub x15, x15, #0x1\n"
+ "lsr x20, x17, #0x1\n"
+ "cmp x20, x15\n"
+ "and x17, x17, #0x1\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "csel x25, x20, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z16.h\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "sub x7, x7, #0x2\n"
- "sub x16, x16, #0x1\n"
- "trn1 z25.h, z25.h, z19.h\n"
- "trn1 z26.h, z26.h, z18.h\n"
- "lsr x20, x7, #0x1\n"
- "cmp x20, x16\n"
- "trn1 z27.h, z27.h, z17.h\n"
- "mov z28.d, z16.d\n"
- "csel x25, x20, x16, LT\n"
- "add x17, x17, %x[ld_in_col]\n"
- "and x7, x7, #0x1\n"
- "sub x16, x16, x25\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ "mov z16.d, z16.d\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x20, SP, #12\n"
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
- ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
- "ld1sb { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc178156a // sdot za.s[x8, 2], { z11.h-z14.h }, z8.h\n"
+ "ld1sb { z25.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add z25.h, p0/M, z25.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc179158a // sdot za.s[x8, 2], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa1402ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22]\n"
+ "ld1sb { z10.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc17115aa // sdot za.s[x8, 2], { z13.h-z16.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z10.h, p0/M, z10.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ "ld1sb { z26.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z10.h\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
+ "add z26.h, p0/M, z26.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1sb { z1.s }, p0/Z, [x20]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "st1b { z28.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
- "trn1 z23.h, z23.h, z16.h\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "ld1sb { z27.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z27.h, p0/M, z27.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z28.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z28.h, p0/M, z28.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ "add z29.h, p0/M, z29.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "trn1 z24.h, z24.h, z1.h\n"
- "trn1 z25.h, z25.h, z3.h\n"
- "trn1 z26.h, z26.h, z30.h\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0xc1741728 // sdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1sb { z15.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p0/Z, [x23]\n"
+ "trn1 z29.h, z29.h, z15.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
- "ld1sb { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc17c1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z12.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "mov z30.d, z16.d\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z7.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1791749 // sdot za.s[x8, 1], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
- "ld1sb { z24.s }, p0/Z, [x20]\n"
- "mov z28.d, z20.d\n"
- "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z22.s }, p0/Z, [x20]\n"
- ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z22.h, p0/M, z22.h, z7.h\n"
- "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1sb { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z31.s }, p0/Z, [x20]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z17.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- "trn1 z23.h, z23.h, z8.h\n"
- "trn1 z24.h, z24.h, z22.h\n"
- "st1b { z18.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "trn1 z25.h, z25.h, z28.h\n"
- "trn1 z26.h, z26.h, z20.h\n"
- "st1b { z19.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "trn1 z27.h, z27.h, z31.h\n"
- "mov z28.d, z1.d\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ "mov z16.d, z16.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
- "add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
- ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
- "ld1sb { z29.s }, p0/Z, [x17]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1711569 // sdot za.s[x8, 1], { z11.h-z14.h }, z1.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1791589 // sdot za.s[x8, 1], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc1a3ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z3.s\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z8.s }, p0/Z, [x22]\n"
- "add z8.h, p0/M, z8.h, z7.h\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "ld1sb { z10.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z10.h, p0/M, z10.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
- "ld1sb { z30.s }, p0/Z, [x22]\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1sb { z12.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "trn1 z11.h, z11.h, z10.h\n"
+ ".inst 0xc1a6ce78 // sclamp { z24.s-z27.s }, z19.s, z6.s\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1sb { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z25.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z26.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
- "trn1 z29.h, z29.h, z8.h\n"
- "ld1sb { z31.s }, p0/Z, [x22]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "st1b { z27.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "ld1sb { z13.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z25.s }, p0/Z, [x22]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z0.s }, p0/Z, [x22]\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
+ "ld1sb { z14.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z1.s }, p0/Z, [x22]\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
+ "ld1sb { z15.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z28.s }, p0/Z, [x22]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "trn1 z30.h, z30.h, z20.h\n"
- "trn1 z31.h, z31.h, z25.h\n"
- "trn1 z0.h, z0.h, z17.h\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z1.h, z1.h, z28.h\n"
- ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
- "ld1sb { z22.s }, p0/Z, [x22]\n"
- ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
- "add z22.h, p0/M, z22.h, z7.h\n"
- ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
- ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
- "mov z2.d, z22.d\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
- ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
- ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
- "st1b { z24.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z25.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z26.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z27.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"22:" // Main loop skip tail
- "cbz x7, 23f\n" // Skip remainder inputs
+ "cbz x17, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z24.s }, p0/Z, [x17]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z11.s }, p0/Z, [x16]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z17.h\n"
- "trn1 z25.h, z25.h, z16.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1sb { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1sb { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1sb { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1sb { z31.s }, p0/Z, [x20]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z26.h, z26.h, z17.h\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "ld1sb { z0.s }, p0/Z, [x20]\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
- "trn1 z28.h, z28.h, z31.h\n"
- "addvl x21, SP, #6\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- "mov z29.d, z0.d\n"
- "addvl x20, SP, #12\n"
- "sub x16, x16, #0x1\n"
- ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
- ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
"ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
- ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
- "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z17.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"add x8, x8, #0x1\n"
- "st1b { z18.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z19.s }, p1, [x9]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
+ "st1b { z28.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"23:" // Tail input: End
- "cbz x16, 25f\n"
+ "cbz x15, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "subs x16, x16, #0x1\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
- "st1b { z28.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z29.s }, p1, [x14]\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a3ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a5aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a7ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc1a6ce68 // sclamp { z8.s-z11.s }, z19.s, z6.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z30.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z31.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z10.s }, p1, [x9]\n"
"add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"bgt 24b\n"
"25:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x7\n"
+ "whilelt p1.s, x7, x6\n"
"incw x20, ALL, MUL #16\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x6\n"
- "whilelt p1.s, x6, x5\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1342,9 +1347,11 @@ void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 60c3a1e632..40bfd5850a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,133 +70,138 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x6\n"
"ptrue p2.b\n"
- "mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x6\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x22, #0x8\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z20.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-12\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z20.h, p2/M, z20.h\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x7\n"
- "addvl SP, SP, #-12\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z21.h, p2/M, z21.h\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z30.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1b { z10.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z23.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z25.h, #0x0\n"
+ "addvl x22, SP, #12\n"
+ "addvl x22, x22, #-4\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z29.d, z28.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z7.h, #0x0\n"
- "sub z10.h, z10.h, z31.h\n"
- "incw x22\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z16.h, z16.h, z31.h\n"
- "trn1 z20.h, z7.h, z10.h\n"
- "ld1b { z11.s }, p2/Z, [x20]\n"
- "sub z11.h, z11.h, z31.h\n"
- "mov x20, x22\n"
- "trn1 z19.h, z10.h, z16.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z0.h, z0.h, z23.h\n"
+ "sub z26.h, z26.h, z23.h\n"
+ "sub z15.h, z15.h, z23.h\n"
+ "trn1 z14.h, z25.h, z0.h\n"
+ "trn1 z2.h, z0.h, z26.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z26.h, z16.h, z11.h\n"
- "trn1 z13.h, z11.h, z7.h\n"
- "ld1b { z11.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z26.h, z15.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z31.h\n"
- "sub z11.h, z11.h, z31.h\n"
- "ld1b { z2.s }, p2/Z, [x20]\n"
- "sub z2.h, z2.h, z31.h\n"
- "addvl x21, SP, #12\n"
- "incw x22\n"
- "addvl x21, x21, #-4\n"
- "mov x20, x22\n"
- "st1h { z20.h }, p2, [x21]\n"
- "trn1 z22.h, z7.h, z24.h\n"
- "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z24.h, z11.h\n"
- "ld1b { z16.s }, p2/Z, [x20]\n"
+ "trn1 z15.h, z15.h, z25.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "sub z21.h, z21.h, z23.h\n"
+ "st1h { z14.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z23.h\n"
+ "st1h { z2.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z11.h, z11.h, z23.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #-4\n"
+ "trn1 z3.h, z25.h, z21.h\n"
+ "trn1 z14.h, z21.h, z1.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z3.h, z11.h, z2.h\n"
- "ld1b { z0.s }, p2/Z, [x20]\n"
+ "trn1 z10.h, z1.h, z11.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z25.h, z2.h, z7.h\n"
- "ld1b { z4.s }, p2/Z, [x20]\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z31.h\n"
- "sub z0.h, z0.h, z31.h\n"
- "addvl x21, x21, #-4\n"
- "st1h { z22.h }, p2, [x21]\n"
- "sub z4.h, z4.h, z31.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z30.d\n"
- "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z24.h, z7.h, z16.h\n"
- "trn1 z18.h, z16.h, z0.h\n"
- "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #-4\n"
- "trn1 z0.h, z0.h, z4.h\n"
- "trn1 z1.h, z4.h, z7.h\n"
- "st1h { z24.h }, p2, [x21]\n"
- "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "trn1 z26.h, z11.h, z25.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z15.h, z15.h, z23.h\n"
+ "st1h { z3.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z23.h\n"
+ "st1h { z14.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z16.h, z16.h, z23.h\n"
+ "st1h { z10.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #-4\n"
+ "trn1 z22.h, z25.h, z15.h\n"
+ "trn1 z6.h, z15.h, z9.h\n"
+ "trn1 z12.h, z9.h, z16.h\n"
+ "trn1 z11.h, z16.h, z25.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z6.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z12.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z11.h }, p2, [x22, #3, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z5.s }, p1/Z, [x21, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x17, x23, LSL #22\n"
"mov x22, #0x6\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x7, x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x7, x14\n"
+ "orr x20, x17, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x7, x21, x14\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040b82 // mova za.d[x8, #2], { z28.d-z29.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ ".inst 0xc0040b83 // mova za.d[x8, #3], { z28.d-z29.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
"ldp x27, x26, [x23], #0x10\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +209,22 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +236,148 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x14]\n"
+ "ld1b { z27.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z20.h, z16.h\n"
- "add z4.h, z4.h, z21.h\n"
- "ld1b { z23.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z5.h, z23.h, z22.h\n"
- "add z5.h, z5.h, z21.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z6.h, z17.h, z16.h\n"
- "add z6.h, z6.h, z21.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
- ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
- ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
- ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z27.h, z16.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "trn1 z16.h, z3.h, z1.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z15.h, z15.h, z20.h\n"
+ "trn1 z17.h, z12.h, z18.h\n"
+ "add z16.h, z16.h, z20.h\n"
+ "add z17.h, z17.h, z20.h\n"
+ ".inst 0xc16b15e8 // sdot za.s[x8, 0], { z15.h-z16.h }, z11.h\n"
+ ".inst 0xc16a15e9 // sdot za.s[x8, 1], { z15.h-z16.h }, z10.h\n"
+ ".inst 0xc1631608 // sdot za.s[x8, 0], { z16.h-z17.h }, z3.h\n"
+ ".inst 0xc1621609 // sdot za.s[x8, 1], { z16.h-z17.h }, z2.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1b { z25.s }, p1/Z, [x14]\n"
+ "ld1b { z22.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1b { z6.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z3.h, z25.h, z6.h\n"
- "add z3.h, z3.h, z21.h\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1b { z26.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z4.h, z18.h, z26.h\n"
- "add z4.h, z4.h, z21.h\n"
- "ld1b { z2.s }, p1/Z, [x22]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z5.s }, p1/Z, [x22]\n"
- "trn1 z5.h, z2.h, z5.h\n"
- "add z5.h, z5.h, z21.h\n"
+ "ld1b { z10.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z22.h, z16.h\n"
+ "ld1b { z7.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z19.h, z10.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
- ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
- ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
- ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
- ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
- ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
- ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "add z21.h, z21.h, z20.h\n"
+ "trn1 z23.h, z11.h, z7.h\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
+ ".inst 0xc16116a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16016a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z0.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16e16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16616c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16916ca // sdot za.s[x8, 2], { z22.h-z23.h }, z9.h\n"
+ ".inst 0xc16116cb // sdot za.s[x8, 3], { z22.h-z23.h }, z1.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z15.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z9.s }, p1/Z, [x20]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z6.h, z6.h, z21.h\n"
- "ld1b { z7.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z7.h, z7.h, z21.h\n"
+ "trn1 z21.h, z15.h, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z1.s }, p1/Z, [x20]\n"
- "trn1 z8.h, z17.h, z1.h\n"
- "add z8.h, z8.h, z21.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"sub x13, x13, x23\n"
+ "trn1 z22.h, z24.h, z9.h\n"
+ "trn1 z23.h, z2.h, z15.h\n"
+ "add z21.h, z21.h, z20.h\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1b { z2.s }, p1/Z, [x14]\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z26.s }, p1/Z, [x14]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ "ld1b { z4.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "ld1b { z23.s }, p1/Z, [x20]\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z14.h\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
- "trn1 z6.h, z2.h, z19.h\n"
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xc16616ab // sdot za.s[x8, 3], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ "ld1b { z11.s }, p1/Z, [x20]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc16916ac // sdot za.s[x8, 4], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16116ad // sdot za.s[x8, 5], { z21.h-z22.h }, z1.h\n"
+ "trn1 z21.h, z26.h, z4.h\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16f16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc1a5ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z5.s\n"
+ ".inst 0xc16716cb // sdot za.s[x8, 3], { z22.h-z23.h }, z7.h\n"
".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
- "st1b { z24.s }, p1, [x11]\n"
- "add x11, x11, x9\n"
- "add z6.h, z6.h, z21.h\n"
- ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
- "trn1 z7.h, z23.h, z18.h\n"
- "trn1 z8.h, z17.h, z16.h\n"
+ "add z21.h, z21.h, z20.h\n"
+ ".inst 0xc1adaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc16916cc // sdot za.s[x8, 4], { z22.h-z23.h }, z9.h\n"
+ ".inst 0xc16116cd // sdot za.s[x8, 5], { z22.h-z23.h }, z1.h\n"
+ "trn1 z22.h, z27.h, z3.h\n"
+ "trn1 z23.h, z25.h, z11.h\n"
"add x8, x8, #0x2\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z26.s }, p1, [x10]\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
+ ".inst 0xc1bfcfd0 // sclamp { z16.s-z19.s }, z30.s, z31.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z17.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- "add z7.h, z7.h, z21.h\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -382,258 +387,258 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1b { z17.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z7.h, z19.h, z18.h\n"
- "trn1 z8.h, z17.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z9.h, z17.h, z16.h\n"
- ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
- ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
- ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ ".inst 0xc16c16e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z12.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc16416e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z4.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ ".inst 0xc16f1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1671709 // sdot za.s[x8, 1], { z24.h-z25.h }, z7.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x22, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x21, SP, #4\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xa1412aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "ld1b { z17.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z16.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z19.h, z18.h\n"
- "trn1 z23.h, z17.h, z16.h\n"
+ "ld1b { z10.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z10.h, p0/M, z10.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z17.h, z18.h, z10.h\n"
+ "add z14.h, p0/M, z14.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #8\n"
- "trn1 z24.h, z17.h, z16.h\n"
- ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
- ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
- ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
- ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
- ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xc16f1608 // sdot za.s[x8, 0], { z16.h-z17.h }, z15.h\n"
+ "ld1b { z10.s }, p0/Z, [x22]\n"
+ ".inst 0xc1671609 // sdot za.s[x8, 1], { z16.h-z17.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ "add z10.h, p0/M, z10.h, z20.h\n"
+ ".inst 0xc16f160a // sdot za.s[x8, 2], { z16.h-z17.h }, z15.h\n"
+ ".inst 0xc167160b // sdot za.s[x8, 3], { z16.h-z17.h }, z7.h\n"
+ "trn1 z18.h, z14.h, z10.h\n"
+ ".inst 0xc16c1628 // sdot za.s[x8, 0], { z17.h-z18.h }, z12.h\n"
+ ".inst 0xc1641629 // sdot za.s[x8, 1], { z17.h-z18.h }, z4.h\n"
".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
- ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xc161162a // sdot za.s[x8, 2], { z17.h-z18.h }, z1.h\n"
+ ".inst 0xc160162b // sdot za.s[x8, 3], { z17.h-z18.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "ld1b { z17.s }, p0/Z, [x14]\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z6.h, z19.h, z18.h\n"
- "trn1 z7.h, z17.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- "sub x15, x15, #0x1\n"
- "sub x13, x13, #0x1\n"
- "cmp x15, x13\n"
- "trn1 z8.h, z17.h, z16.h\n"
- "csel x23, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col]\n"
- "sub x13, x13, x23\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
"cbz x23, 17f\n"
"16:" // Padded: Main loop
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z9.s }, p0/Z, [x14]\n"
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ "addvl x21, SP, #4\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z16.s }, p0/Z, [x14]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xc16f16ac // sdot za.s[x8, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ad // sdot za.s[x8, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16e16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z14.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16616cb // sdot za.s[x8, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
"ld1b { z18.s }, p0/Z, [x22]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16f16cc // sdot za.s[x8, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16716cd // sdot za.s[x8, 5], { z22.h-z23.h }, z7.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z21.h, z16.h, z17.h\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
- "addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #8\n"
- ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
- "subs x23, x23, #0x1\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
"ld1b { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- "ld1b { z2.s }, p0/Z, [x22]\n"
- ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
- "add z2.h, p0/M, z2.h, z21.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z6.h, z9.h, z19.h\n"
- ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- "trn1 z7.h, z18.h, z16.h\n"
- "trn1 z8.h, z17.h, z2.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- "st1b { z24.s }, p1, [x11]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ ".inst 0xc16f16ac // sdot za.s[x8, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ad // sdot za.s[x8, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16e16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc16616cb // sdot za.s[x8, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc16c16cc // sdot za.s[x8, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc16416cd // sdot za.s[x8, 5], { z22.h-z23.h }, z4.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
- ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
- ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a5ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1adaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc1a8ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc1bfcfd8 // sclamp { z24.s-z27.s }, z30.s, z31.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -652,6 +657,8 @@ void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #12\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index e4ce6c74fb..0b17ad3ae9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,119 +70,124 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x9\n"
"ptrue p2.b\n"
- "mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x6\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x22, #0x8\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z29.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-6\n"
+ "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z29.h, p2/M, z29.h\n"
+ "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x7\n"
- "addvl SP, SP, #-6\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z11.h, p2/M, z11.h\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z16.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1b { z26.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z22.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z27.h, #0x0\n"
+ "addvl x22, SP, #6\n"
+ "addvl x22, x22, #-2\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z17.d, z16.d\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z26.h, z26.h, z16.h\n"
- "incw x22\n"
- "mov z24.h, #0x0\n"
- "ld1b { z3.s }, p2/Z, [x20]\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z3.h, z3.h, z16.h\n"
- "trn1 z31.h, z26.h, z3.h\n"
- "ld1b { z21.s }, p2/Z, [x20]\n"
- "sub z21.h, z21.h, z16.h\n"
- "mov x20, x22\n"
- "trn1 z14.h, z21.h, z24.h\n"
- "ld1b { z2.s }, p2/Z, [x20]\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z25.h, z25.h, z22.h\n"
+ "sub z15.h, z15.h, z22.h\n"
+ "sub z9.h, z9.h, z22.h\n"
+ "trn1 z24.h, z25.h, z15.h\n"
+ "ld1b { z12.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z2.h, z2.h, z16.h\n"
- "addvl x21, SP, #6\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z16.h\n"
- "incw x22\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
- "sub z27.h, z27.h, z16.h\n"
- "addvl x21, x21, #-2\n"
- "mov x20, x22\n"
- "st1h { z31.h }, p2, [x21]\n"
- "trn1 z4.h, z2.h, z25.h\n"
- "ld1b { z26.s }, p2/Z, [x20]\n"
+ "trn1 z11.h, z9.h, z27.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "sub z12.h, z12.h, z22.h\n"
+ "sub z4.h, z4.h, z22.h\n"
+ "st1h { z24.h }, p2, [x22]\n"
+ "sub z15.h, z15.h, z22.h\n"
+ "st1h { z11.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #-2\n"
+ "trn1 z9.h, z12.h, z4.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z12.h, z27.h, z24.h\n"
- "ld1b { z20.s }, p2/Z, [x20]\n"
- "sub z26.h, z26.h, z16.h\n"
- "sub z23.h, z23.h, z16.h\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z20.h, z20.h, z16.h\n"
- "addvl x21, x21, #-2\n"
- "st1h { z4.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
- "addvl x21, x21, #-2\n"
- "mov z30.d, z28.d\n"
- "mov z31.d, z28.d\n"
- "trn1 z25.h, z26.h, z23.h\n"
- "st1h { z25.h }, p2, [x21]\n"
- "trn1 z3.h, z20.h, z24.h\n"
- "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "trn1 z21.h, z15.h, z27.h\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z22.h\n"
+ "sub z10.h, z10.h, z22.h\n"
+ "st1h { z9.h }, p2, [x22]\n"
+ "sub z30.h, z30.h, z22.h\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #-2\n"
+ "trn1 z15.h, z14.h, z10.h\n"
+ "trn1 z25.h, z30.h, z27.h\n"
+ "st1h { z15.h }, p2, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z1.s }, p1/Z, [x21, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x17, x23, LSL #22\n"
"mov x22, #0x9\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x7, x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x7, x14\n"
+ "orr x20, x17, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x7, x21, x14\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
"ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
@@ -191,24 +196,24 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a8ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc1bccfec // sclamp { z12.s-z15.s }, z31.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "st1b { z12.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z25.s }, p1, [x10]\n"
+ "st1b { z13.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z26.s }, p1, [x27]\n"
+ "st1b { z14.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z15.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +225,194 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z1.s }, p1/Z, [x14]\n"
+ "ld1b { z23.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z21.h\n"
- "add z1.h, z1.h, z11.h\n"
- "ld1b { z2.s }, p1/Z, [x21]\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z2.h, z2.h, z15.h\n"
- "add z2.h, z2.h, z11.h\n"
- "ld1b { z3.s }, p1/Z, [x21]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z3.h, z3.h, z21.h\n"
- "add z3.h, z3.h, z11.h\n"
- "ld1b { z4.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z4.h, z19.h\n"
- "add z4.h, z4.h, z11.h\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
- "mov z5.d, z8.d\n"
- "add z5.h, z5.h, z11.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z6.h\n"
+ "ld1b { z10.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z29.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "add z24.h, z24.h, z29.h\n"
+ "mov z27.d, z10.d\n"
+ "add z25.h, z25.h, z29.h\n"
+ "add z26.h, z26.h, z29.h\n"
+ "add z27.h, z27.h, z29.h\n"
+ ".inst 0xc17616e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z6.h\n"
+ ".inst 0xc17e1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z14.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z1.s }, p1/Z, [x14]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z21.h\n"
- "add z1.h, z1.h, z11.h\n"
"ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z2.h, z2.h, z12.h\n"
- "add z2.h, z2.h, z11.h\n"
- "ld1b { z3.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
+ "trn1 z20.h, z20.h, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z3.h, z3.h, z8.h\n"
- "add z3.h, z3.h, z11.h\n"
- "ld1b { z4.s }, p1/Z, [x21]\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z5.s }, p1/Z, [x21]\n"
+ "trn1 z21.h, z21.h, z25.h\n"
+ "ld1b { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z4.h, z5.h\n"
- "add z4.h, z4.h, z11.h\n"
- "ld1b { z5.s }, p1/Z, [x21]\n"
- "mov z5.d, z5.d\n"
- "add z5.h, z5.h, z11.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "trn1 z22.h, z22.h, z24.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add z20.h, z20.h, z29.h\n"
+ ".inst 0xa0402a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z9.h\n"
+ "add z21.h, z21.h, z29.h\n"
+ "mov z24.d, z3.d\n"
+ "add z22.h, z22.h, z29.h\n"
+ "add z23.h, z23.h, z29.h\n"
+ "add z24.h, z24.h, z29.h\n"
+ ".inst 0xc1761688 // sdot za.s[x8, 0], { z20.h-z23.h }, z6.h\n"
+ ".inst 0xc17716a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z7.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
+ "ld1b { z10.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z21.h, z21.h, z11.h\n"
- "ld1b { z25.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1b { z23.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z25.h\n"
"csel x23, x20, x13, LT\n"
- "add z22.h, z22.h, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z18.h\n"
- "add z23.h, z23.h, z11.h\n"
- "ld1b { z24.s }, p1/Z, [x21]\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z19.h\n"
- "add z24.h, z24.h, z11.h\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
- "mov z25.d, z8.d\n"
- "add z25.h, z25.h, z11.h\n"
+ "trn1 z11.h, z11.h, z24.h\n"
"and x15, x15, #0x1\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"sub x13, x13, x23\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z30.h\n"
+ "add z10.h, z10.h, z29.h\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "add z11.h, z11.h, z29.h\n"
+ "mov z14.d, z26.d\n"
+ "add z12.h, z12.h, z29.h\n"
+ "add z13.h, z13.h, z29.h\n"
+ "add z14.h, z14.h, z29.h\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1771549 // sdot za.s[x8, 1], { z10.h-z13.h }, z7.h\n"
+ "ld1b { z3.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "add x20, x14, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- "trn1 z21.h, z21.h, z18.h\n"
- "ld1b { z22.s }, p1/Z, [x22]\n"
+ "ld1b { z9.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z21.h, z21.h, z11.h\n"
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f1569 // sdot za.s[x8, 1], { z11.h-z14.h }, z15.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z8.h\n"
- "add z22.h, z22.h, z11.h\n"
- "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z3.h, z9.h\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ld1b { z27.s }, p1/Z, [x22]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z27.h\n"
- "add z23.h, z23.h, z11.h\n"
- "ld1b { z24.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a1ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z1.s\n"
+ "ld1b { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z8.h\n"
- "add z24.h, z24.h, z11.h\n"
- "ld1b { z4.s }, p1/Z, [x22]\n"
- "mov z25.d, z4.d\n"
- "add z25.h, z25.h, z11.h\n"
- ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
- ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z12.h\n"
- ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "trn1 z4.h, z4.h, z15.h\n"
+ "add z3.h, z3.h, z29.h\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z5.h, z5.h, z10.h\n"
+ "ld1b { z21.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a0aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ "trn1 z6.h, z6.h, z14.h\n"
+ "add z4.h, z4.h, z29.h\n"
+ "mov z7.d, z21.d\n"
+ "add z5.h, z5.h, z29.h\n"
+ ".inst 0xc1a8ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add z6.h, z6.h, z29.h\n"
+ "add z7.h, z7.h, z29.h\n"
+ ".inst 0xc1bccff8 // sclamp { z24.s-z27.s }, z31.s, z28.s\n"
+ ".inst 0xc17a1468 // sdot za.s[x8, 0], { z3.h-z6.h }, z10.h\n"
+ "ld1b { z10.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
"ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z0.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc17b1488 // sdot za.s[x8, 0], { z4.h-z7.h }, z11.h\n"
+ "ld1b { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z20.h\n"
- "st1b { z1.s }, p1, [x10]\n"
- "ld1b { z23.s }, p1/Z, [x20]\n"
+ "trn1 z10.h, z10.h, z22.h\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z2.s }, p1, [x27]\n"
- "ld1b { z24.s }, p1/Z, [x20]\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z24.h\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "ld1b { z24.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z3.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z3.s }, p1/Z, [x20]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z3.h\n"
- "add z21.h, z21.h, z11.h\n"
- "ld1b { z3.s }, p1/Z, [x20]\n"
- "mov z25.d, z3.d\n"
- "add z22.h, z22.h, z11.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- "add z23.h, z23.h, z11.h\n"
- "add z24.h, z24.h, z11.h\n"
- "add z25.h, z25.h, z11.h\n"
+ "trn1 z11.h, z11.h, z14.h\n"
+ "add z10.h, z10.h, z29.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z12.h, z12.h, z9.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
+ "trn1 z13.h, z13.h, z6.h\n"
+ "add z11.h, z11.h, z29.h\n"
+ "mov z14.d, z20.d\n"
+ "add z12.h, z12.h, z29.h\n"
+ "add z13.h, z13.h, z29.h\n"
+ "add z14.h, z14.h, z29.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -417,440 +422,440 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x14]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z20.h, z20.h, z22.h\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z4.s }, p0/Z, [x20]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z17.h\n"
- "trn1 z23.h, z23.h, z4.h\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z23.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z24.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z11.h\n"
- "addvl x20, SP, #4\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "trn1 z25.h, z25.h, z17.h\n"
- ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
- "mov z26.d, z1.d\n"
- ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
+ ".inst 0xc1731688 // sdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ "mov z24.d, z24.d\n"
+ ".inst 0xc17b16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z11.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x14]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z5.s }, p0/Z, [x20]\n"
- "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z17.h\n"
- "trn1 z23.h, z23.h, z5.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z20.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z11.h\n"
- "addvl x20, SP, #2\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "trn1 z25.h, z25.h, z17.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "mov z26.d, z15.d\n"
- ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "trn1 z24.h, z24.h, z25.h\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
+ ".inst 0xc17316a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "mov z25.d, z20.d\n"
+ ".inst 0xc17b16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
- "mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z18.h\n"
- "trn1 z22.h, z22.h, z3.h\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- "add z20.h, p0/M, z20.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z23.h, z23.h, z19.h\n"
- "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z25.d, z3.d\n"
- "csel x22, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
- "16:" // Padded: Main loop
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "addvl x20, SP, #4\n"
- "mov x12, #0x0\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "ld1b { z10.s }, p0/Z, [x14]\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
- "ld1b { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z20.h\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z15.h, p0/M, z15.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x21]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z20.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x21]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z4.s }, p0/Z, [x21]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z14.h\n"
- "trn1 z22.h, z22.h, z15.h\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "addvl x20, SP, #2\n"
- "ld1b { z2.s }, p0/Z, [x21]\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z4.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z21.h\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
+ "mov z14.d, z20.d\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "add z2.h, p0/M, z2.h, z11.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721549 // sdot za.s[x8, 1], { z10.h-z13.h }, z2.h\n"
+ "ld1b { z10.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x20, x14, %x[ld_in_row]\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z25.d, z2.d\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1731569 // sdot za.s[x8, 1], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1b { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z26.h, p0/M, z26.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ "ld1b { z11.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z26.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z5.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "st1b { z6.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z11.h, z11.h, z9.h\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z9.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add z12.h, p0/M, z12.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z11.h\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
- "add x11, x11, x9\n"
- "trn1 z21.h, z21.h, z20.h\n"
- "st1b { z17.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "trn1 z22.h, z22.h, z4.h\n"
- "trn1 z23.h, z23.h, z27.h\n"
- "st1b { z18.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
- "trn1 z24.h, z24.h, z12.h\n"
- "mov z25.d, z8.d\n"
- "st1b { z19.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
- "add x14, x14, %x[ld_in_col]\n"
- "bgt 16b\n"
- "17:" // Main loop tail
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z9.s }, p0/Z, [x22]\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1b { z0.s }, p0/Z, [x14]\n"
- "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "ld1b { z10.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "mov z14.d, z9.d\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z11.h\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z11.h\n"
+ "ld1b { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z25.h\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z12.h, p0/M, z12.h, z11.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z2.s }, p0/Z, [x20]\n"
- "add z2.h, p0/M, z2.h, z11.h\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z15.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z4.h, p0/M, z4.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z4.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
+ "add z4.h, p0/M, z4.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "trn1 z13.h, z13.h, z4.h\n"
+ "add z26.h, p0/M, z26.h, z29.h\n"
+ "mov z14.d, z26.d\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "addvl x22, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- "trn1 z0.h, z0.h, z14.h\n"
- "add x8, x8, #0x1\n"
- "add z27.h, p0/M, z27.h, z11.h\n"
- "trn1 z1.h, z1.h, z12.h\n"
- "trn1 z2.h, z2.h, z21.h\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1721549 // sdot za.s[x8, 1], { z10.h-z13.h }, z2.h\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z3.h, z3.h, z25.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- "mov z4.d, z27.d\n"
- ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1731569 // sdot za.s[x8, 1], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ "ld1b { z10.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z15.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "st1b { z17.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- "st1b { z18.s }, p1, [x27]\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z19.s }, p1, [x26]\n"
+ "trn1 z10.h, z10.h, z15.h\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z5.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z5.h, p0/M, z5.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z15.h\n"
+ "add z5.h, p0/M, z5.h, z29.h\n"
+ ".inst 0xc1721528 // sdot za.s[x8, 0], { z9.h-z12.h }, z2.h\n"
+ "mov z13.d, z5.d\n"
+ ".inst 0xc1731548 // sdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #4\n"
+ "sub x13, x13, #0x1\n"
"ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z15.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z0.s }, p0/Z, [x20]\n"
- "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z17.h\n"
- "trn1 z22.h, z22.h, z0.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z12.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z20.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z5.h, p0/M, z5.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z30.h, p0/M, z30.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z4.s }, p0/Z, [x20]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z5.h\n"
- "mov z25.d, z4.d\n"
- "addvl x20, SP, #4\n"
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z6.s }, p0/Z, [x21]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "add z6.h, p0/M, z6.h, z29.h\n"
+ ".inst 0xc17216a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "mov z25.d, z6.d\n"
+ ".inst 0xc17316c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17516a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z5.h\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc17d16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z13.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "st1b { z17.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z18.s }, p1, [x27]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z19.s }, p1, [x26]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
- "st1b { z0.s }, p1, [x11]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z1.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z2.s }, p1, [x27]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z3.s }, p1, [x26]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -869,6 +874,8 @@ void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #6\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index d33ef764ef..d4db24071c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,249 +70,254 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "ptrue p2.b\n"
+ "mov x22, SP\n"
"mov x20, #0x8\n"
+ "ptrue p2.b\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x5\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x22, #0x8\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x21, x21, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x20, x5\n"
+ "mov SP, x21\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-30\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "neg z15.h, p2/M, z15.h\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x6\n"
- "addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z17.h, p2/M, z17.h\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z18.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x23\n"
- "ld1b { z2.s }, p2/Z, [x20]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z0.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z13.h, #0x0\n"
+ "addvl x22, SP, #30\n"
+ "addvl x22, x22, #-6\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z29.d, z28.d\n"
+ "mov x23, x24\n"
+ "incw x24\n"
+ "ld1b { z22.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1b { z21.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1b { z19.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1b { z25.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z0.h\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "ld1b { z5.s }, p2/Z, [x23]\n"
+ "mov x20, x24\n"
+ "incw x24\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "sub z25.h, z25.h, z0.h\n"
+ "sub z5.h, z5.h, z0.h\n"
+ "trn1 z6.h, z13.h, z22.h\n"
+ "trn1 z23.h, z22.h, z21.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z15.h, #0x0\n"
- "sub z2.h, z2.h, z3.h\n"
- "incw x23\n"
- "ld1b { z13.s }, p2/Z, [x20]\n"
+ "trn1 z4.h, z21.h, z19.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z13.h, z13.h, z3.h\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z19.h, z25.h\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z27.h, z27.h, z3.h\n"
- "trn1 z0.h, z2.h, z13.h\n"
+ "trn1 z22.h, z25.h, z5.h\n"
+ "ld1b { z7.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z25.h, z5.h, z13.h\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "sub z9.h, z9.h, z0.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "mov x20, x24\n"
+ "sub z18.h, z18.h, z0.h\n"
+ "st1h { z6.h }, p2, [x22]\n"
+ "incw x24\n"
+ "sub z7.h, z7.h, z0.h\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z1.h, z1.h, z0.h\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z20.h, z13.h, z27.h\n"
+ "trn1 z12.h, z27.h, z9.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z2.h, z9.h, z18.h\n"
"ld1b { z19.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z19.h, z19.h, z3.h\n"
- "trn1 z26.h, z13.h, z27.h\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z9.h, z18.h, z7.h\n"
"ld1b { z14.s }, p2/Z, [x20]\n"
- "sub z14.h, z14.h, z3.h\n"
- "mov x20, x23\n"
- "trn1 z10.h, z27.h, z19.h\n"
- "ld1b { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z19.h, z19.h, z14.h\n"
- "trn1 z1.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z18.h, z7.h, z1.h\n"
"ld1b { z5.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z9.h, z9.h, z3.h\n"
- "sub z5.h, z5.h, z3.h\n"
- "ld1b { z29.s }, p2/Z, [x20]\n"
+ "st1h { z25.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z25.h, z1.h, z13.h\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "addvl x22, x22, #-6\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "mov x20, x24\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "sub z5.h, z5.h, z0.h\n"
+ "st1h { z12.h }, p2, [x22, #1, MUL VL]\n"
+ "incw x24\n"
+ "st1h { z2.h }, p2, [x22, #2, MUL VL]\n"
+ "sub z16.h, z16.h, z0.h\n"
+ "trn1 z7.h, z13.h, z21.h\n"
+ "trn1 z20.h, z21.h, z19.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z29.h, z29.h, z3.h\n"
- "addvl x22, SP, #30\n"
+ "trn1 z17.h, z19.h, z14.h\n"
+ "st1h { z9.h }, p2, [x22, #3, MUL VL]\n"
"ld1b { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "incw x23\n"
- "sub z2.h, z2.h, z3.h\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
- "addvl x22, x22, #-6\n"
- "sub z23.h, z23.h, z3.h\n"
- "mov x20, x23\n"
- "st1h { z11.h }, p2, [x22]\n"
- "trn1 z20.h, z15.h, z9.h\n"
- "incw x23\n"
- "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z22.h, z9.h, z5.h\n"
- "ld1b { z25.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z9.h, z5.h, z29.h\n"
+ "trn1 z12.h, z14.h, z5.h\n"
+ "st1h { z18.h }, p2, [x22, #4, MUL VL]\n"
"ld1b { z21.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z26.h, z29.h, z2.h\n"
- "ld1b { z0.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z28.h, z2.h, z23.h\n"
- "ld1b { z19.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z2.h, z23.h, z15.h\n"
- "sub z25.h, z25.h, z3.h\n"
+ "st1h { z25.h }, p2, [x22, #5, MUL VL]\n"
"addvl x22, x22, #-6\n"
- "sub z21.h, z21.h, z3.h\n"
- "ld1b { z6.s }, p2/Z, [x20]\n"
- "sub z0.h, z0.h, z3.h\n"
- "mov x20, x23\n"
- "sub z19.h, z19.h, z3.h\n"
- "sub z6.h, z6.h, z3.h\n"
- "st1h { z20.h }, p2, [x22]\n"
- "incw x23\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z11.h, z15.h, z25.h\n"
- "trn1 z10.h, z25.h, z21.h\n"
- "ld1b { z5.s }, p2/Z, [x20]\n"
+ "trn1 z5.h, z5.h, z16.h\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z14.h, z21.h, z0.h\n"
+ "trn1 z4.h, z16.h, z13.h\n"
+ "sub z6.h, z6.h, z0.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "mov x20, x24\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "st1h { z7.h }, p2, [x22]\n"
+ "sub z25.h, z25.h, z0.h\n"
+ "st1h { z20.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z1.h, z13.h, z6.h\n"
+ "trn1 z24.h, z6.h, z2.h\n"
"ld1b { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z21.h, z0.h, z19.h\n"
- "ld1b { z27.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z19.h, z19.h, z6.h\n"
- "ld1b { z29.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z13.h, z6.h, z15.h\n"
- "sub z5.h, z5.h, z3.h\n"
- "sub z23.h, z23.h, z3.h\n"
- "ld1b { z1.s }, p2/Z, [x20]\n"
- "addvl x22, x22, #-6\n"
- "sub z27.h, z27.h, z3.h\n"
- "sub z29.h, z29.h, z3.h\n"
- "mov x20, x23\n"
- "st1h { z11.h }, p2, [x22]\n"
- "sub z1.h, z1.h, z3.h\n"
- "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z30.h, z15.h, z5.h\n"
- "trn1 z26.h, z5.h, z23.h\n"
- "ld1b { z11.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z2.h, z21.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z22.h, z23.h, z27.h\n"
- "ld1b { z5.s }, p2/Z, [x20]\n"
+ "st1h { z12.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z20.h, z21.h, z25.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z28.h, z27.h, z29.h\n"
- "ld1b { z8.s }, p2/Z, [x20]\n"
+ "st1h { z5.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z17.h, z25.h, z19.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z27.h, z29.h, z1.h\n"
- "ld1b { z9.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z2.h, z1.h, z15.h\n"
- "ld1b { z14.s }, p2/Z, [x20]\n"
- "sub z11.h, z11.h, z3.h\n"
+ "st1h { z4.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z19.h, z19.h, z13.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z0.h\n"
"addvl x22, x22, #-6\n"
- "sub z5.h, z5.h, z3.h\n"
- "sub z8.h, z8.h, z3.h\n"
- "st1h { z30.h }, p2, [x22]\n"
- "sub z9.h, z9.h, z3.h\n"
- "sub z14.h, z14.h, z3.h\n"
- "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
- "mov z19.d, z18.d\n"
- "trn1 z22.h, z15.h, z11.h\n"
- "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z1.h, z11.h, z5.h\n"
- "trn1 z31.h, z5.h, z8.h\n"
- "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z8.h, z8.h, z9.h\n"
- "trn1 z21.h, z9.h, z14.h\n"
- "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "sub z6.h, z6.h, z0.h\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "st1h { z1.h }, p2, [x22]\n"
+ "sub z22.h, z22.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z16.h, z13.h, z23.h\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z7.h, z23.h, z6.h\n"
+ "trn1 z12.h, z6.h, z14.h\n"
+ "st1h { z19.h }, p2, [x22, #5, MUL VL]\n"
"addvl x22, x22, #-6\n"
- "trn1 z15.h, z14.h, z15.h\n"
- "st1h { z22.h }, p2, [x22]\n"
- "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
- "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
- "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z5.h, z14.h, z22.h\n"
+ "trn1 z14.h, z22.h, z27.h\n"
+ "trn1 z20.h, z27.h, z13.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "st1h { z7.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z12.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z5.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z14.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #5, MUL VL]\n"
"cbz x21, 3f\n"
- "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x25, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x5\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x6, x5\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x11, #0x0\n"
"mov x8, #0x8\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x6, x16\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x25, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x6, x16\n"
+ "orr x20, x7, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x16, x6, x20, x16\n"
- ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0046b80 // mova za.d[x11, #0], { z28.d-z29.d }\n"
"mov x22, #0x4\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x6, x21, x16\n"
+ ".inst 0xc0046b81 // mova za.d[x11, #1], { z28.d-z29.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046b82 // mova za.d[x11, #2], { z28.d-z29.d }\n"
"ldp x14, x13, [x23], #0x10\n"
- ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ ".inst 0xc0046b83 // mova za.d[x11, #3], { z28.d-z29.d }\n"
"ldp x4, x10, [x20], #0x10\n"
- ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ ".inst 0xc0046b84 // mova za.d[x11, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0046b85 // mova za.d[x11, #5], { z28.d-z29.d }\n"
"ldp x9, x28, [x23], #0x10\n"
- ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ ".inst 0xc0046b86 // mova za.d[x11, #6], { z28.d-z29.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
- ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc0046b87 // mova za.d[x11, #7], { z28.d-z29.d }\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066810 // mova { z16.d-z17.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
- ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
- ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
- ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ ".inst 0xc0066832 // mova { z18.d-z19.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc1abaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc1becff0 // sclamp { z16.s-z19.s }, z31.s, z30.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z20.s }, p1, [x14]\n"
+ "st1b { z16.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z22.s }, p1, [x13]\n"
+ "st1b { z18.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z21.s }, p1, [x9]\n"
+ "st1b { z17.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z23.s }, p1, [x28]\n"
+ "st1b { z19.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -328,331 +333,331 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1b { z1.s }, p1/Z, [x16]\n"
+ "ld1b { z4.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1b { z28.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z1.h, z28.h\n"
- "add z27.h, z27.h, z17.h\n"
- "ld1b { z1.s }, p1/Z, [x21]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z2.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z1.h, z2.h\n"
- "add z28.h, z28.h, z17.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z6.s }, p1/Z, [x21]\n"
+ "trn1 z22.h, z4.h, z13.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z13.h, z6.h\n"
- "add z29.h, z29.h, z17.h\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z25.h, z19.h\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ "add z22.h, z22.h, z15.h\n"
+ "trn1 z24.h, z14.h, z27.h\n"
"ld1b { z20.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z30.h, z20.h\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z17.h\n"
- ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z23.h, z23.h, z15.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "trn1 z25.h, z21.h, z20.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16d76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16c76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z12.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc16e76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z14.h\n"
+ ".inst 0xc16676e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xc1617708 // sdot za.s[x11, 0], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xc1607709 // sdot za.s[x11, 1], { z24.h-z25.h }, z0.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1b { z2.s }, p1/Z, [x16]\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1b { z28.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z20.h, z2.h, z28.h\n"
- "add z20.h, z20.h, z17.h\n"
- "ld1b { z31.s }, p1/Z, [x22]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z11.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z21.h, z31.h, z11.h\n"
- "add z21.h, z21.h, z17.h\n"
- "ld1b { z25.s }, p1/Z, [x22]\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z22.h, z25.h, z8.h\n"
- "add z22.h, z22.h, z17.h\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
- "ld1b { z3.s }, p1/Z, [x22]\n"
- "trn1 z23.h, z8.h, z3.h\n"
- ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
- "add z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z21.h, z18.h\n"
+ "ld1b { z7.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z25.h, z17.h, z3.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add z24.h, z24.h, z15.h\n"
+ "trn1 z26.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
- ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
- ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
- ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
- ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
- ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
- ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "trn1 z27.h, z17.h, z16.h\n"
+ "add z26.h, z26.h, z15.h\n"
+ ".inst 0xc1637708 // sdot za.s[x11, 0], { z24.h-z25.h }, z3.h\n"
+ ".inst 0xc1627709 // sdot za.s[x11, 1], { z24.h-z25.h }, z2.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ "add z27.h, z27.h, z15.h\n"
+ ".inst 0xc16d770a // sdot za.s[x11, 2], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc16c770b // sdot za.s[x11, 3], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1697728 // sdot za.s[x11, 0], { z25.h-z26.h }, z9.h\n"
+ ".inst 0xc1617729 // sdot za.s[x11, 1], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xc1677748 // sdot za.s[x11, 0], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc1667749 // sdot za.s[x11, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa0422a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc16c774b // sdot za.s[x11, 3], { z26.h-z27.h }, z12.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1b { z2.s }, p1/Z, [x16]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1b { z22.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z0.h, z2.h, z22.h\n"
- "add z0.h, z0.h, z17.h\n"
- "ld1b { z14.s }, p1/Z, [x23]\n"
+ "ld1b { z19.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z6.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z1.h, z14.h, z6.h\n"
- "add z1.h, z1.h, z17.h\n"
- "ld1b { z15.s }, p1/Z, [x23]\n"
+ "ld1b { z4.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z6.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z2.h, z15.h, z6.h\n"
- "add z2.h, z2.h, z17.h\n"
- "ld1b { z21.s }, p1/Z, [x23]\n"
+ "ld1b { z3.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
- "ld1b { z30.s }, p1/Z, [x23]\n"
- "trn1 z3.h, z21.h, z30.h\n"
- ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
- ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
- "add z3.h, z3.h, z17.h\n"
- ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
- ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
- ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
- ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
- ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
- ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
- ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
- ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
- ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z22.h, z0.h, z19.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z4.h, z3.h\n"
+ "ld1b { z9.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ "add z22.h, z22.h, z15.h\n"
+ "trn1 z24.h, z17.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ "add z23.h, z23.h, z15.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "trn1 z25.h, z9.h, z17.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16576c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc16576ca // sdot za.s[x11, 2], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476cb // sdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16776e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc16676e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16576cc // sdot za.s[x11, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476cd // sdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xc16776ea // sdot za.s[x11, 2], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc16676eb // sdot za.s[x11, 3], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617708 // sdot za.s[x11, 0], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xc1607709 // sdot za.s[x11, 1], { z24.h-z25.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16576ec // sdot za.s[x11, 4], { z23.h-z24.h }, z5.h\n"
+ ".inst 0xc16476ed // sdot za.s[x11, 5], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc167770a // sdot za.s[x11, 2], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc166770b // sdot za.s[x11, 3], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1422a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d770c // sdot za.s[x11, 4], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc165770d // sdot za.s[x11, 5], { z24.h-z25.h }, z5.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1b { z0.s }, p1/Z, [x16]\n"
+ "ld1b { z16.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1b { z3.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z0.h, z3.h\n"
- "add z28.h, z28.h, z17.h\n"
- "ld1b { z6.s }, p1/Z, [x24]\n"
+ "ld1b { z22.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1b { z30.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z6.h, z30.h\n"
- "add z29.h, z29.h, z17.h\n"
- "ld1b { z1.s }, p1/Z, [x24]\n"
+ "ld1b { z19.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
"ld1b { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z30.h, z1.h, z25.h\n"
- "add z30.h, z30.h, z17.h\n"
- "ld1b { z3.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z18.h, z16.h, z22.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z5.s }, p1/Z, [x24]\n"
- "trn1 z31.h, z3.h, z5.h\n"
- ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
- "add z31.h, z31.h, z17.h\n"
- ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
- ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
- ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
- ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
- ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
- ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
- ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
- ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
- ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
- ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
- ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
- ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
- ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
- ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
- ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
- ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "ld1b { z4.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z19.h, z19.h, z25.h\n"
+ "ld1b { z27.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ "add z18.h, z18.h, z15.h\n"
+ "trn1 z20.h, z6.h, z4.h\n"
+ "ld1b { z22.s }, p1/Z, [x24]\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "add z19.h, z19.h, z15.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "trn1 z21.h, z27.h, z22.h\n"
+ "add z20.h, z20.h, z15.h\n"
+ ".inst 0xc1697648 // sdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ ".inst 0xc1617649 // sdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22]\n"
+ "add z21.h, z21.h, z15.h\n"
+ ".inst 0xc16c764a // sdot za.s[x11, 2], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xc164764b // sdot za.s[x11, 3], { z18.h-z19.h }, z4.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d7668 // sdot za.s[x11, 0], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1657669 // sdot za.s[x11, 1], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xa1412ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163764c // sdot za.s[x11, 4], { z18.h-z19.h }, z3.h\n"
+ ".inst 0xc162764d // sdot za.s[x11, 5], { z18.h-z19.h }, z2.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16c766a // sdot za.s[x11, 2], { z19.h-z20.h }, z12.h\n"
+ ".inst 0xc164766b // sdot za.s[x11, 3], { z19.h-z20.h }, z4.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169764e // sdot za.s[x11, 6], { z18.h-z19.h }, z9.h\n"
+ ".inst 0xc161764f // sdot za.s[x11, 7], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xc163766c // sdot za.s[x11, 4], { z19.h-z20.h }, z3.h\n"
+ ".inst 0xc162766d // sdot za.s[x11, 5], { z19.h-z20.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16c768a // sdot za.s[x11, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc164768b // sdot za.s[x11, 3], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xa1422aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169766e // sdot za.s[x11, 6], { z19.h-z20.h }, z9.h\n"
+ ".inst 0xc161766f // sdot za.s[x11, 7], { z19.h-z20.h }, z1.h\n"
+ ".inst 0xc16c768c // sdot za.s[x11, 4], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc164768d // sdot za.s[x11, 5], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xa0422a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d768e // sdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc16c768f // sdot za.s[x11, 7], { z20.h-z21.h }, z12.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z26.s }, p1/Z, [x16]\n"
+ "ld1b { z6.s }, p1/Z, [x16]\n"
"sub x25, x25, #0x1\n"
- "ld1b { z28.s }, p1/Z, [x20]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1b { z31.s }, p1/Z, [x20]\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x25, x15\n"
- "add z25.h, z25.h, z17.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z31.h, z15.h\n"
"csel x25, x25, x15, LT\n"
- "ld1b { z22.s }, p1/Z, [x20]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z26.h, z26.h, z17.h\n"
+ "trn1 z24.h, z6.h, z13.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z8.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z22.h, z8.h\n"
- "add z27.h, z27.h, z17.h\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- "add z28.h, z28.h, z17.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z21.h, z19.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z20.h, z13.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ "trn1 z27.h, z22.h, z16.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ "add z26.h, z26.h, z15.h\n"
+ "add z27.h, z27.h, z15.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
"addvl x23, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x16]\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "ld1b { z23.s }, p1/Z, [x16]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z0.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
- ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
- "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc169770a // sdot za.s[x11, 2], { z24.h-z25.h }, z9.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
- "ld1b { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc161770b // sdot za.s[x11, 3], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
"ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
- "ld1b { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16c772a // sdot za.s[x11, 2], { z25.h-z26.h }, z12.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc164772b // sdot za.s[x11, 3], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
- "ld1b { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16d770e // sdot za.s[x11, 6], { z24.h-z25.h }, z13.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc165770f // sdot za.s[x11, 7], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
- ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
- "trn1 z25.h, z21.h, z0.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
- "add z25.h, z25.h, z17.h\n"
- ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
- "trn1 z26.h, z20.h, z31.h\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
- "add z26.h, z26.h, z17.h\n"
- ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
- "trn1 z27.h, z29.h, z22.h\n"
- "trn1 z28.h, z30.h, z6.h\n"
+ ".inst 0xc161774c // sdot za.s[x11, 4], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc160774d // sdot za.s[x11, 5], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa0422ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc167774e // sdot za.s[x11, 6], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1422aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16c1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1641709 // sdot za.s[x8, 1], { z24.h-z25.h }, z4.h\n"
+ "trn1 z24.h, z23.h, z19.h\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1651729 // sdot za.s[x8, 1], { z25.h-z26.h }, z5.h\n"
+ "trn1 z25.h, z21.h, z20.h\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ "trn1 z26.h, z22.h, z18.h\n"
+ "trn1 z27.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z27.h, z27.h, z17.h\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "add z26.h, z26.h, z15.h\n"
+ "add z27.h, z27.h, z15.h\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "add z28.h, z28.h, z17.h\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
@@ -667,513 +672,513 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z9.s }, p0/Z, [x16]\n"
- "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z9.h, z22.h\n"
- "trn1 z0.h, z21.h, z20.h\n"
+ "add z26.h, p0/M, z26.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z26.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16e76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z14.h\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc16676c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z6.h\n"
+ "add z25.h, p0/M, z25.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1b { z1.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z25.h\n"
+ "add z1.h, p0/M, z1.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z1.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
- ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z2.h, z21.h, z20.h\n"
- ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
- ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
- ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ ".inst 0xc16d76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z13.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc16c76e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ "trn1 z25.h, z1.h, z16.h\n"
+ ".inst 0xc1637708 // sdot za.s[x11, 0], { z24.h-z25.h }, z3.h\n"
+ ".inst 0xc1627709 // sdot za.s[x11, 1], { z24.h-z25.h }, z2.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z5.s }, p0/Z, [x16]\n"
- "add z5.h, p0/M, z5.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z5.h, z22.h\n"
- "trn1 z29.h, z21.h, z20.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z1.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z14.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1667409 // sdot za.s[x11, 1], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #18\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e740a // sdot za.s[x11, 2], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc166740b // sdot za.s[x11, 3], { z0.h-z1.h }, z6.h\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z2.h, z18.h, z17.h\n"
+ "add z0.h, p0/M, z0.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "trn1 z30.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #24\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
- ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "trn1 z31.h, z21.h, z20.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
- ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
- ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
- ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
- ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
- ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
- ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
- ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16c7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z12.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc1647429 // sdot za.s[x11, 1], { z1.h-z2.h }, z4.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
+ ".inst 0xc16e742a // sdot za.s[x11, 2], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc166742b // sdot za.s[x11, 3], { z1.h-z2.h }, z6.h\n"
+ "trn1 z3.h, z0.h, z17.h\n"
+ ".inst 0xc16d7448 // sdot za.s[x11, 0], { z2.h-z3.h }, z13.h\n"
+ ".inst 0xc1657449 // sdot za.s[x11, 1], { z2.h-z3.h }, z5.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e744a // sdot za.s[x11, 2], { z2.h-z3.h }, z14.h\n"
+ ".inst 0xc166744b // sdot za.s[x11, 3], { z2.h-z3.h }, z6.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x16]\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z8.h, z29.h, z22.h\n"
- "trn1 z9.h, z21.h, z20.h\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16376c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z3.h\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc16276c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z2.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "addvl x22, SP, #12\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cb // sdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ ".inst 0xc16976cc // sdot za.s[x11, 4], { z22.h-z23.h }, z9.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- "trn1 z10.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #18\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
- ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- "trn1 z11.h, z21.h, z20.h\n"
- ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
- ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
- ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
- ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
- ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
- ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
- ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
- ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
- ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
- ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc16176cd // sdot za.s[x11, 5], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16c76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z12.h\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xc16476e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0xc16e76ea // sdot za.s[x11, 2], { z23.h-z24.h }, z14.h\n"
+ ".inst 0xc16676eb // sdot za.s[x11, 3], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ ".inst 0xc16976ec // sdot za.s[x11, 4], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176ed // sdot za.s[x11, 5], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16d7708 // sdot za.s[x11, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1657709 // sdot za.s[x11, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa0422aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc165770a // sdot za.s[x11, 2], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc164770b // sdot za.s[x11, 3], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x24, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z1.s }, p0/Z, [x16]\n"
- "add z1.h, p0/M, z1.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z21.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z1.h, z22.h\n"
- "trn1 z27.h, z21.h, z20.h\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z22.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e76a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z14.h\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16676a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "addvl x23, SP, #6\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16776aa // sdot za.s[x11, 2], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16676ab // sdot za.s[x11, 3], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z23.h, z18.h, z16.h\n"
+ ".inst 0xc16776ac // sdot za.s[x11, 4], { z21.h-z22.h }, z7.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
- "trn1 z28.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "addvl x22, SP, #12\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
- ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #18\n"
- "trn1 z29.h, z21.h, z20.h\n"
- ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
- "addvl x20, SP, #24\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
- ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc16676ad // sdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16576c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z5.h\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc16476c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16d76ae // sdot za.s[x11, 6], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc16c76af // sdot za.s[x11, 7], { z21.h-z22.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0xc16e76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cb // sdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
- ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
- ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
- ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
- ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
- ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
- ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
- ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16e76cc // sdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cd // sdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16776ce // sdot za.s[x11, 6], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xc16676cf // sdot za.s[x11, 7], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc16176ea // sdot za.s[x11, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16076eb // sdot za.s[x11, 3], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16976ec // sdot za.s[x11, 4], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176ed // sdot za.s[x11, 5], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1422a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16c76ee // sdot za.s[x11, 6], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc16476ef // sdot za.s[x11, 7], { z23.h-z24.h }, z4.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 22f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z6.s }, p0/Z, [x16]\n"
- "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z30.s }, p0/Z, [x20]\n"
- "add z30.h, p0/M, z30.h, z17.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z17.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z6.h, z30.h\n"
- "trn1 z26.h, z27.h, z26.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z17.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z9.s }, p0/Z, [x20]\n"
- "add z9.h, p0/M, z9.h, z17.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- "sub x25, x25, #0x1\n"
- "sub x15, x15, #0x1\n"
- "cmp x25, x15\n"
- "trn1 z27.h, z8.h, z9.h\n"
- "trn1 z28.h, z21.h, z29.h\n"
- "csel x25, x25, x15, LT\n"
- "add x16, x16, %x[ld_in_col]\n"
- "sub x15, x15, x25\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ "trn1 z27.h, z17.h, z16.h\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z8.s }, p0/Z, [x16]\n"
- "add z8.h, p0/M, z8.h, z17.h\n"
- "add x24, x16, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
- "addvl x23, SP, #6\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- "addvl x22, SP, #12\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
- ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24]\n"
+ "addvl x23, SP, #12\n"
+ "add x22, x16, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1b { z29.s }, p0/Z, [x24]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- "mov x12, #0x4\n"
- "add x24, x24, %x[ld_in_row]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z16.s }, p0/Z, [x16]\n"
+ ".inst 0xc16d770a // sdot za.s[x11, 2], { z24.h-z25.h }, z13.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc165770b // sdot za.s[x11, 3], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc169772a // sdot za.s[x11, 2], { z25.h-z26.h }, z9.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc161772b // sdot za.s[x11, 3], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ "add z19.h, p0/M, z19.h, z15.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc165770e // sdot za.s[x11, 6], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc164770f // sdot za.s[x11, 7], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
- "ld1b { z30.s }, p0/Z, [x24]\n"
- "add z30.h, p0/M, z30.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "add z23.h, p0/M, z23.h, z15.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16d772e // sdot za.s[x11, 6], { z25.h-z26.h }, z13.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc165772f // sdot za.s[x11, 7], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc167774c // sdot za.s[x11, 4], { z26.h-z27.h }, z7.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa0422aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "subs x25, x25, #0x1\n"
- ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
- "ld1b { z15.s }, p0/Z, [x24]\n"
- "add z15.h, p0/M, z15.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163774e // sdot za.s[x11, 6], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774f // sdot za.s[x11, 7], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16c1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z12.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
- "ld1b { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1641709 // sdot za.s[x8, 1], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ "trn1 z24.h, z16.h, z19.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16d1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1651729 // sdot za.s[x8, 1], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z25.h, z23.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
- "ld1b { z31.s }, p0/Z, [x24]\n"
- "add z31.h, p0/M, z31.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
- "ld1b { z22.s }, p0/Z, [x24]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
- ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
- ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- "trn1 z25.h, z8.h, z21.h\n"
- ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z26.h, z29.h, z30.h\n"
- ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc1631748 // sdot za.s[x8, 0], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc1621749 // sdot za.s[x8, 1], { z26.h-z27.h }, z2.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x2\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z27.h, z15.h, z20.h\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- "trn1 z28.h, z31.h, z22.h\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "trn1 z26.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ "trn1 z27.h, z17.h, z18.h\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
- ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xc16e770a // sdot za.s[x11, 2], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770b // sdot za.s[x11, 3], { z24.h-z25.h }, z6.h\n"
".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d772a // sdot za.s[x11, 2], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc165772b // sdot za.s[x11, 3], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16c770e // sdot za.s[x11, 6], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc164770f // sdot za.s[x11, 7], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
- ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
- ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
- ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16c774c // sdot za.s[x11, 4], { z26.h-z27.h }, z12.h\n"
+ ".inst 0xc164774d // sdot za.s[x11, 5], { z26.h-z27.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc161774e // sdot za.s[x11, 6], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc160774f // sdot za.s[x11, 7], { z26.h-z27.h }, z0.h\n"
".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
- ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
- ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
- ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xc16d1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1651709 // sdot za.s[x8, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc16e1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1661729 // sdot za.s[x8, 1], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xc1611748 // sdot za.s[x8, 0], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc1601749 // sdot za.s[x8, 1], { z26.h-z27.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066818 // mova { z24.d-z25.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc006683a // mova { z26.d-z27.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc1abaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1becff8 // sclamp { z24.s-z27.s }, z31.s, z30.s\n"
+ "st1b { z24.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z26.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z25.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z27.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
"incw x20, ALL, MUL #16\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1192,6 +1197,8 @@ void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index 6c144afa77..6dbdcc6a84 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,194 +69,199 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0xb\n"
"ptrue p2.b\n"
- "mov x20, #0xb\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x3\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x22, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x5\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x4\n"
+ "sub x21, x21, x4\n"
+ "mov SP, x20\n"
+ "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
"addvl SP, SP, #-15\n"
- "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z7.h, p2/M, z7.h\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "whilelt p1.s, XZR, x6\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z18.h, p2/M, z18.h\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "whilelt p8.s, XZR, x5\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z12.s, #0x0\n"
+ "mov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x7, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1b { z13.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z0.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z12.h, #0x0\n"
+ "addvl x22, SP, #15\n"
+ "addvl x22, x22, #-3\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z21.d, z20.d\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z13.h, z13.h, z28.h\n"
- "incw x22\n"
- "mov z26.h, #0x0\n"
- "ld1b { z22.s }, p2/Z, [x20]\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z22.h, z22.h, z28.h\n"
- "trn1 z17.h, z13.h, z22.h\n"
- "ld1b { z20.s }, p2/Z, [x20]\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z20.h, z20.h, z28.h\n"
- "addvl x21, SP, #15\n"
- "ld1b { z1.s }, p2/Z, [x20]\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "sub z24.h, z24.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "sub z1.h, z1.h, z28.h\n"
- "trn1 z29.h, z20.h, z1.h\n"
+ "sub z30.h, z30.h, z0.h\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z8.h, z8.h, z0.h\n"
+ "sub z17.h, z17.h, z0.h\n"
+ "sub z26.h, z26.h, z0.h\n"
+ "trn1 z16.h, z24.h, z30.h\n"
"ld1b { z27.s }, p2/Z, [x20]\n"
- "mov x20, x22\n"
- "sub z27.h, z27.h, z28.h\n"
- "incw x22\n"
- "ld1b { z14.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "sub z14.h, z14.h, z28.h\n"
- "addvl x21, x21, #-3\n"
- "ld1b { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z18.h, z18.h, z28.h\n"
- "trn1 z22.h, z27.h, z26.h\n"
- "ld1b { z23.s }, p2/Z, [x20]\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z23.h, z23.h, z28.h\n"
- "st1h { z17.h }, p2, [x21]\n"
- "ld1b { z30.s }, p2/Z, [x20]\n"
+ "trn1 z15.h, z8.h, z17.h\n"
+ "ld1b { z31.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z30.h, z30.h, z28.h\n"
- "trn1 z8.h, z14.h, z18.h\n"
- "ld1b { z15.s }, p2/Z, [x20]\n"
- "mov x20, x22\n"
- "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
- "sub z15.h, z15.h, z28.h\n"
- "ld1b { z20.s }, p2/Z, [x20]\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z23.h, z23.h, z30.h\n"
- "sub z20.h, z20.h, z28.h\n"
- "ld1b { z24.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z28.h\n"
- "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z26.h, z12.h\n"
+ "sub z11.h, z11.h, z0.h\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "sub z31.h, z31.h, z0.h\n"
+ "incw x23\n"
+ "sub z9.h, z9.h, z0.h\n"
+ "st1h { z15.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z10.h, z10.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z11.h, z27.h, z11.h\n"
"ld1b { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z15.h, z26.h\n"
- "incw x22\n"
- "ld1b { z13.s }, p2/Z, [x20]\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z28.h\n"
- "sub z13.h, z13.h, z28.h\n"
- "ld1b { z11.s }, p2/Z, [x20]\n"
- "addvl x21, x21, #-3\n"
- "mov x20, x22\n"
- "st1h { z8.h }, p2, [x21]\n"
- "trn1 z27.h, z20.h, z24.h\n"
- "ld1b { z22.s }, p2/Z, [x20]\n"
+ "trn1 z13.h, z31.h, z9.h\n"
+ "ld1b { z28.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z11.h, z11.h, z28.h\n"
- "ld1b { z3.s }, p2/Z, [x20]\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z20.h, z16.h, z13.h\n"
- "ld1b { z13.s }, p2/Z, [x20]\n"
+ "trn1 z8.h, z10.h, z12.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z28.h, z28.h, z0.h\n"
+ "incw x23\n"
+ "sub z26.h, z26.h, z0.h\n"
+ "st1h { z13.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "st1h { z8.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z13.h, z16.h, z2.h\n"
+ "ld1b { z31.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z3.h, z3.h, z28.h\n"
- "ld1b { z15.s }, p2/Z, [x20]\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z29.h, z11.h, z26.h\n"
+ "trn1 z30.h, z28.h, z26.h\n"
"ld1b { z16.s }, p2/Z, [x20]\n"
- "incw x22\n"
- "sub z13.h, z13.h, z28.h\n"
- "sub z15.h, z15.h, z28.h\n"
- "addvl x21, x21, #-3\n"
- "mov x20, x22\n"
- "st1h { z27.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z28.h\n"
- "trn1 z19.h, z22.h, z3.h\n"
- "ld1b { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
- "ld1b { z0.s }, p2/Z, [x20]\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z31.h, z31.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z31.h, z13.h, z15.h\n"
- "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
- "ld1b { z18.s }, p2/Z, [x20]\n"
+ "trn1 z17.h, z14.h, z12.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z13.h }, p2, [x22]\n"
+ "sub z16.h, z16.h, z0.h\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "st1h { z30.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z4.h, z4.h, z0.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z31.h, z31.h, z2.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z16.h, z16.h, z26.h\n"
- "sub z17.h, z17.h, z28.h\n"
- "ld1b { z22.s }, p2/Z, [x20]\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z0.h, z0.h, z28.h\n"
- "sub z18.h, z18.h, z28.h\n"
- "ld1b { z1.s }, p2/Z, [x20]\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z1.h, z1.h, z28.h\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "addvl x21, x21, #-3\n"
- "st1h { z19.h }, p2, [x21]\n"
- "mov z13.d, z12.d\n"
- "mov z14.d, z12.d\n"
- "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
- "mov z15.d, z12.d\n"
- "trn1 z8.h, z17.h, z0.h\n"
- "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
- "addvl x21, x21, #-3\n"
- "trn1 z31.h, z18.h, z22.h\n"
- "trn1 z29.h, z1.h, z26.h\n"
- "st1h { z8.h }, p2, [x21]\n"
- "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "trn1 z24.h, z16.h, z27.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z4.h, z4.h, z12.h\n"
+ "sub z29.h, z29.h, z0.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "sub z10.h, z10.h, z0.h\n"
+ "st1h { z31.h }, p2, [x22]\n"
+ "sub z13.h, z13.h, z0.h\n"
+ "sub z8.h, z8.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z11.h, z11.h, z0.h\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z14.h, z29.h, z10.h\n"
+ "trn1 z10.h, z13.h, z8.h\n"
+ "trn1 z4.h, z11.h, z12.h\n"
+ "st1h { z14.h }, p2, [x22]\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x21, x7, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x20, x7, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x7, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x5, x23, LSL #22\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
"mov x22, #0xb\n"
- "add x21, x4, x3\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x5, x4\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x4, x17\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x17, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x5, x16\n"
+ "orr x20, x6, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x17, x4, x20, x17\n"
- ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x23], #0x10\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
- "ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x5, x21, x16\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "ldp x10, x9, [x23], #0x10\n"
- "ldp x28, x27, [x20], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0040e83 // mova za.d[x8, #3], { z20.d-z23.d }\n"
+ "ldp x11, x10, [x20], #0x10\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
@@ -264,379 +269,379 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- "sub x16, x16, x21\n"
- ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z29.s }, p1, [x14]\n"
+ "st1b { z28.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z30.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z31.s }, p1, [x9]\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
"add x9, x9, x27\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x4, x3\n"
+ "adds XZR, x5, x4\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x7, x7, x22\n"
+ "sub x17, x17, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
"cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x21, x17, %x[ld_in_row]\n"
- "ld1b { z27.s }, p1/Z, [x17]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x16]\n"
"addvl x20, SP, #12\n"
- "ld1b { z0.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z0.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z11.s }, p1/Z, [x21]\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z28.h, z11.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1b { z29.s }, p1/Z, [x21]\n"
+ "ld1b { z10.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
+ "trn1 z8.h, z8.h, z26.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z8.h\n"
- "add z29.h, z29.h, z7.h\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z31.h\n"
"ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z17.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1b { z31.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "ld1b { z26.s }, p1/Z, [x21]\n"
+ "trn1 z10.h, z10.h, z16.h\n"
+ "add z8.h, z8.h, z18.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z26.h\n"
- "add z31.h, z31.h, z7.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
- "mov z0.d, z20.d\n"
- "add z0.h, z0.h, z7.h\n"
- ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "trn1 z11.h, z11.h, z30.h\n"
+ "add z9.h, z9.h, z18.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add z10.h, z10.h, z18.h\n"
+ "trn1 z12.h, z12.h, z28.h\n"
+ "ld1h { z4.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "mov z13.d, z2.d\n"
+ "add z12.h, z12.h, z18.h\n"
+ ".inst 0xc1701508 // sdot za.s[x8, 0], { z8.h-z11.h }, z0.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xc1711528 // sdot za.s[x8, 0], { z9.h-z12.h }, z1.h\n"
+ ".inst 0xc1741548 // sdot za.s[x8, 0], { z10.h-z13.h }, z4.h\n"
"9:" // Unpadded: 3 priming loads
- "add x21, x17, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x17]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p1/Z, [x16]\n"
"addvl x20, SP, #9\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z17.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z0.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1b { z31.s }, p1/Z, [x21]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1b { z0.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z0.h, z0.h, z16.h\n"
- "add z0.h, z0.h, z7.h\n"
- "ld1b { z1.s }, p1/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z2.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z16.h\n"
- "add z1.h, z1.h, z7.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z2.d, z16.d\n"
- "add z2.h, z2.h, z7.h\n"
- ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "trn1 z14.h, z14.h, z24.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add z14.h, z14.h, z18.h\n"
+ "trn1 z16.h, z16.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add z15.h, z15.h, z18.h\n"
+ "mov z17.d, z17.d\n"
+ "add z16.h, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "add z17.h, z17.h, z18.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17015c8 // sdot za.s[x8, 0], { z14.h-z17.h }, z0.h\n"
"10:" // Unpadded: 2 priming loads
- "add x22, x17, %x[ld_in_row]\n"
- "ld1b { z26.s }, p1/Z, [x17]\n"
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x16]\n"
"addvl x21, SP, #6\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z16.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z29.s }, p1/Z, [x22]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z28.h, z29.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1b { z29.s }, p1/Z, [x22]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z19.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x22]\n"
+ "trn1 z12.h, z12.h, z26.h\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z23.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z30.h, z23.h\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z30.h, z30.h, z7.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
- "ld1b { z22.s }, p1/Z, [x22]\n"
- "mov z31.d, z22.d\n"
- ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
- ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "trn1 z13.h, z13.h, z24.h\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add z13.h, z13.h, z18.h\n"
+ "trn1 z15.h, z15.h, z24.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z14.h, z14.h, z18.h\n"
+ "mov z16.d, z16.d\n"
+ "add z15.h, z15.h, z18.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "add z16.h, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17115a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z1.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ ".inst 0xc1781589 // sdot za.s[x8, 1], { z12.h-z15.h }, z8.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
- "add x22, x17, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x17]\n"
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z9.s }, p1/Z, [x16]\n"
"addvl x21, SP, #3\n"
- "ld1b { z22.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z22.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x22]\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1b { z25.s }, p1/Z, [x22]\n"
+ "ld1b { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z25.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1b { z31.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1b { z0.s }, p1/Z, [x22]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z4.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z16.h\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z0.h, z0.h, z16.h\n"
- "add z0.h, z0.h, z7.h\n"
- "ld1b { z1.s }, p1/Z, [x22]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z2.s }, p1/Z, [x22]\n"
- "trn1 z1.h, z1.h, z2.h\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "add z9.h, z9.h, z18.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z1.h, z1.h, z7.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
- "ld1b { z24.s }, p1/Z, [x22]\n"
- "mov z2.d, z24.d\n"
- ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
- ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
- "add z2.h, z2.h, z7.h\n"
- "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
- ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
- "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z10.h, z10.h, z18.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "trn1 z13.h, z13.h, z17.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z12.h, z12.h, z18.h\n"
+ "mov z14.d, z16.d\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xc1701528 // sdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add z14.h, z14.h, z18.h\n"
+ ".inst 0xc1711548 // sdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701529 // sdot za.s[x8, 1], { z9.h-z12.h }, z0.h\n"
+ ".inst 0xc1741568 // sdot za.s[x8, 0], { z11.h-z14.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1711549 // sdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
"12:" // Unpadded: 0 priming loads
- "cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "cmp x17, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
- "add x21, x17, %x[ld_in_row]\n"
- "ld1b { z23.s }, p1/Z, [x17]\n"
- "sub x7, x7, #0x2\n"
- "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x16]\n"
+ "sub x17, x17, #0x2\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z25.h\n"
- "sub x16, x16, #0x1\n"
- "ld1b { z24.s }, p1/Z, [x21]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "lsr x20, x7, #0x1\n"
- "add z23.h, z23.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "lsr x20, x17, #0x1\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z30.h\n"
- "cmp x20, x16\n"
- "ld1b { z25.s }, p1/Z, [x21]\n"
+ "cmp x20, x15\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "csel x26, x20, x16, LT\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "csel x25, x20, x15, LT\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z22.h\n"
- "add z25.h, z25.h, z7.h\n"
- "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "and x17, x17, #0x1\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z22.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x21]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "and x7, x7, #0x1\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z4.h\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z30.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1b { z28.s }, p1/Z, [x21]\n"
- "mov z28.d, z28.d\n"
- "add z28.h, z28.h, z7.h\n"
- "sub x16, x16, x26\n"
- "cbz x26, 21f\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ "mov z16.d, z30.d\n"
+ "add z14.h, z14.h, z18.h\n"
+ "add z15.h, z15.h, z18.h\n"
+ "add z16.h, z16.h, z18.h\n"
+ "cbz x25, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
- "addvl x25, SP, #6\n"
- "addvl x24, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
- "add x23, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "add x23, x16, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
- "subs x26, x26, #0x1\n"
- ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
- ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
- "ld1b { z23.s }, p1/Z, [x17]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ "ld1b { z28.s }, p1/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z16.h\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z23.h, z23.h, z7.h\n"
- "ld1b { z24.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a3ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z3.s\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ "ld1b { z29.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
- "ld1b { z18.s }, p1/Z, [x23]\n"
+ "ld1b { z9.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1b { z25.s }, p1/Z, [x23]\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "ld1b { z8.s }, p1/Z, [x23]\n"
+ "trn1 z29.h, z29.h, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z8.h\n"
- "add z25.h, z25.h, z7.h\n"
- "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add z28.h, z28.h, z18.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "ld1b { z28.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a7ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1b { z13.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z28.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x23]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ "ld1b { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
- "ld1b { z28.s }, p1/Z, [x23]\n"
- "trn1 z27.h, z27.h, z28.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z29.h, z29.h, z18.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z27.h, z27.h, z7.h\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- "ld1b { z20.s }, p1/Z, [x23]\n"
- "mov z28.d, z20.d\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "ld1b { z23.s }, p1/Z, [x17]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "st1b { z17.s }, p1, [x14]\n"
+ "trn1 z31.h, z31.h, z13.h\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ce78 // sclamp { z24.s-z27.s }, z19.s, z6.s\n"
+ "ld1h { z12.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z30.h, z30.h, z18.h\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "mov z1.d, z8.d\n"
+ "add z31.h, z31.h, z18.h\n"
+ "st1b { z24.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "st1b { z25.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "add z0.h, z0.h, z18.h\n"
+ "st1b { z26.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z1.h, z1.h, z18.h\n"
+ "st1b { z27.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ ".inst 0xc17a1788 // sdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xc17b17a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1781789 // sdot za.s[x8, 1], { z28.h-z31.h }, z8.h\n"
+ "ld1b { z11.s }, p1/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17c17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z12.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z16.h\n"
- "st1b { z18.s }, p1, [x10]\n"
- "ld1b { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc17917a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add x10, x10, x28\n"
- "st1b { z19.s }, p1, [x9]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z16.h\n"
- "add x9, x9, x27\n"
- "ld1b { z25.s }, p1/Z, [x20]\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc17417c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z4.h\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "add z23.h, z23.h, z7.h\n"
+ "trn1 z12.h, z12.h, z9.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z16.h\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z25.h, z25.h, z7.h\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z16.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x20]\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "add z27.h, z27.h, z7.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z28.d, z16.d\n"
- "add z28.h, z28.h, z7.h\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "trn1 z14.h, z14.h, z1.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ "mov z16.d, z16.d\n"
+ "add z14.h, z14.h, z18.h\n"
+ "add z15.h, z15.h, z18.h\n"
+ "add z16.h, z16.h, z18.h\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x7, x7, x22\n"
+ "sub x17, x17, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -644,686 +649,686 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x17]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x21, x17, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z28.s }, p0/Z, [x21]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z17.h\n"
- "trn1 z28.h, z28.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x21]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z30.s }, p0/Z, [x21]\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z31.s }, p0/Z, [x21]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x20, SP, #12\n"
+ ".inst 0xc1711568 // sdot za.s[x8, 0], { z11.h-z14.h }, z1.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z18.h\n"
- "trn1 z30.h, z30.h, z17.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
- "mov z0.d, z20.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
- "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x17]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x21, x17, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x21]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z17.h\n"
- "trn1 z25.h, z25.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z26.s }, p0/Z, [x21]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z27.s }, p0/Z, [x21]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z28.s }, p0/Z, [x21]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x20, SP, #9\n"
+ ".inst 0xc1711568 // sdot za.s[x8, 0], { z11.h-z14.h }, z1.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z18.h\n"
- "trn1 z27.h, z27.h, z17.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- "trn1 z28.h, z28.h, z16.h\n"
- ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x21]\n"
- "add z11.h, p0/M, z11.h, z7.h\n"
- "mov z29.d, z11.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z25.s }, p0/Z, [x17]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #12\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z17.h\n"
- "trn1 z26.h, z26.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x21, SP, #6\n"
- "trn1 z27.h, z27.h, z18.h\n"
- "trn1 z28.h, z28.h, z17.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z29.h, z29.h, z16.h\n"
- ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #12\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
- "mov z30.d, z1.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z25.s }, p0/Z, [x17]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #9\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z17.h\n"
- "trn1 z26.h, z26.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x21, SP, #3\n"
- "trn1 z27.h, z27.h, z18.h\n"
- "trn1 z28.h, z28.h, z17.h\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z29.h, z29.h, z16.h\n"
- ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
- "ld1b { z0.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #9\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
- ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
- "mov z30.d, z0.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"19:" // Padded: 0 priming loads
- "cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "cmp x17, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "sub x17, x17, #0x2\n"
+ "sub x15, x15, #0x1\n"
+ "lsr x20, x17, #0x1\n"
+ "cmp x20, x15\n"
+ "and x17, x17, #0x1\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "csel x25, x20, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "sub x7, x7, #0x2\n"
- "sub x16, x16, #0x1\n"
- "trn1 z25.h, z25.h, z19.h\n"
- "trn1 z26.h, z26.h, z18.h\n"
- "lsr x20, x7, #0x1\n"
- "cmp x20, x16\n"
- "trn1 z27.h, z27.h, z17.h\n"
- "mov z28.d, z16.d\n"
- "csel x25, x20, x16, LT\n"
- "add x17, x17, %x[ld_in_col]\n"
- "and x7, x7, #0x1\n"
- "sub x16, x16, x25\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ "mov z16.d, z16.d\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x20, SP, #12\n"
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
- ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
- "ld1b { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc178156a // sdot za.s[x8, 2], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z25.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add z25.h, p0/M, z25.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc179158a // sdot za.s[x8, 2], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa1402ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z10.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc17115aa // sdot za.s[x8, 2], { z13.h-z16.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z10.h, p0/M, z10.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ "ld1b { z26.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z10.h\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
+ "add z26.h, p0/M, z26.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "st1b { z28.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
- "trn1 z23.h, z23.h, z16.h\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "ld1b { z27.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z27.h, p0/M, z27.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z28.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z28.h, p0/M, z28.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ "add z29.h, p0/M, z29.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "trn1 z24.h, z24.h, z1.h\n"
- "trn1 z25.h, z25.h, z3.h\n"
- "trn1 z26.h, z26.h, z30.h\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0xc1741728 // sdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1b { z15.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "trn1 z29.h, z29.h, z15.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
- "ld1b { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc17c1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z12.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "mov z30.d, z16.d\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z7.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1791749 // sdot za.s[x8, 1], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "mov z28.d, z20.d\n"
- "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z22.h, p0/M, z22.h, z7.h\n"
- "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z31.s }, p0/Z, [x20]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z17.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- "trn1 z23.h, z23.h, z8.h\n"
- "trn1 z24.h, z24.h, z22.h\n"
- "st1b { z18.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "trn1 z25.h, z25.h, z28.h\n"
- "trn1 z26.h, z26.h, z20.h\n"
- "st1b { z19.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "trn1 z27.h, z27.h, z31.h\n"
- "mov z28.d, z1.d\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ "mov z16.d, z16.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
- "add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
- ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
- "ld1b { z29.s }, p0/Z, [x17]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1711569 // sdot za.s[x8, 1], { z11.h-z14.h }, z1.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1791589 // sdot za.s[x8, 1], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc1a3ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z3.s\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z8.s }, p0/Z, [x22]\n"
- "add z8.h, p0/M, z8.h, z7.h\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z10.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z10.h, p0/M, z10.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
- "ld1b { z30.s }, p0/Z, [x22]\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "trn1 z11.h, z11.h, z10.h\n"
+ ".inst 0xc1a6ce78 // sclamp { z24.s-z27.s }, z19.s, z6.s\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z25.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z26.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
- "trn1 z29.h, z29.h, z8.h\n"
- "ld1b { z31.s }, p0/Z, [x22]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "st1b { z27.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z25.s }, p0/Z, [x22]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z0.s }, p0/Z, [x22]\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z1.s }, p0/Z, [x22]\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z28.s }, p0/Z, [x22]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "trn1 z30.h, z30.h, z20.h\n"
- "trn1 z31.h, z31.h, z25.h\n"
- "trn1 z0.h, z0.h, z17.h\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z1.h, z1.h, z28.h\n"
- ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
- "ld1b { z22.s }, p0/Z, [x22]\n"
- ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
- "add z22.h, p0/M, z22.h, z7.h\n"
- ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
- ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
- "mov z2.d, z22.d\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
- ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
- ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
- "st1b { z24.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z25.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z26.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z27.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"22:" // Main loop skip tail
- "cbz x7, 23f\n" // Skip remainder inputs
+ "cbz x17, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x17]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z17.h\n"
- "trn1 z25.h, z25.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z31.s }, p0/Z, [x20]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z26.h, z26.h, z17.h\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "ld1b { z0.s }, p0/Z, [x20]\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
- "trn1 z28.h, z28.h, z31.h\n"
- "addvl x21, SP, #6\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- "mov z29.d, z0.d\n"
- "addvl x20, SP, #12\n"
- "sub x16, x16, #0x1\n"
- ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
- ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
"ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
- ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
- "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z17.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"add x8, x8, #0x1\n"
- "st1b { z18.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z19.s }, p1, [x9]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
+ "st1b { z28.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"23:" // Tail input: End
- "cbz x16, 25f\n"
+ "cbz x15, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "subs x16, x16, #0x1\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
- "st1b { z28.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z29.s }, p1, [x14]\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a3ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a5aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a7ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc1a6ce68 // sclamp { z8.s-z11.s }, z19.s, z6.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z30.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z31.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z10.s }, p1, [x9]\n"
"add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"bgt 24b\n"
"25:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x7\n"
+ "whilelt p1.s, x7, x6\n"
"incw x20, ALL, MUL #16\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x6\n"
- "whilelt p1.s, x6, x5\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1342,9 +1347,11 @@ void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
index 612beb342a..0ed98e15de 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,133 +70,138 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x6\n"
"ptrue p2.b\n"
- "mov x20, #0x6\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x6\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x22, #0x8\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z20.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-12\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z20.h, p2/M, z20.h\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x7\n"
- "addvl SP, SP, #-12\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z21.h, p2/M, z21.h\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z30.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z23.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z25.h, #0x0\n"
+ "addvl x22, SP, #12\n"
+ "addvl x22, x22, #-4\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z29.d, z28.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z7.h, #0x0\n"
- "sub z10.h, z10.h, z31.h\n"
- "incw x22\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z16.h, z16.h, z31.h\n"
- "trn1 z20.h, z7.h, z10.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
- "sub z11.h, z11.h, z31.h\n"
- "mov x20, x22\n"
- "trn1 z19.h, z10.h, z16.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z0.h, z0.h, z23.h\n"
+ "sub z26.h, z26.h, z23.h\n"
+ "sub z15.h, z15.h, z23.h\n"
+ "trn1 z14.h, z25.h, z0.h\n"
+ "trn1 z2.h, z0.h, z26.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "trn1 z26.h, z16.h, z11.h\n"
- "trn1 z13.h, z11.h, z7.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z26.h, z15.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z24.h, z24.h, z31.h\n"
- "sub z11.h, z11.h, z31.h\n"
- "ld1sb { z2.s }, p2/Z, [x20]\n"
- "sub z2.h, z2.h, z31.h\n"
- "addvl x21, SP, #12\n"
- "incw x22\n"
- "addvl x21, x21, #-4\n"
- "mov x20, x22\n"
- "st1h { z20.h }, p2, [x21]\n"
- "trn1 z22.h, z7.h, z24.h\n"
- "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z1.h, z24.h, z11.h\n"
- "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "trn1 z15.h, z15.h, z25.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "sub z21.h, z21.h, z23.h\n"
+ "st1h { z14.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z23.h\n"
+ "st1h { z2.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z11.h, z11.h, z23.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #-4\n"
+ "trn1 z3.h, z25.h, z21.h\n"
+ "trn1 z14.h, z21.h, z1.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z3.h, z11.h, z2.h\n"
- "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "trn1 z10.h, z1.h, z11.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
- "trn1 z25.h, z2.h, z7.h\n"
- "ld1sb { z4.s }, p2/Z, [x20]\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z16.h, z16.h, z31.h\n"
- "sub z0.h, z0.h, z31.h\n"
- "addvl x21, x21, #-4\n"
- "st1h { z22.h }, p2, [x21]\n"
- "sub z4.h, z4.h, z31.h\n"
- "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
- "mov z31.d, z30.d\n"
- "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z24.h, z7.h, z16.h\n"
- "trn1 z18.h, z16.h, z0.h\n"
- "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #-4\n"
- "trn1 z0.h, z0.h, z4.h\n"
- "trn1 z1.h, z4.h, z7.h\n"
- "st1h { z24.h }, p2, [x21]\n"
- "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "trn1 z26.h, z11.h, z25.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z15.h, z15.h, z23.h\n"
+ "st1h { z3.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z23.h\n"
+ "st1h { z14.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z16.h, z16.h, z23.h\n"
+ "st1h { z10.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #-4\n"
+ "trn1 z22.h, z25.h, z15.h\n"
+ "trn1 z6.h, z15.h, z9.h\n"
+ "trn1 z12.h, z9.h, z16.h\n"
+ "trn1 z11.h, z16.h, z25.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z6.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z12.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z11.h }, p2, [x22, #3, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z5.s }, p1/Z, [x21, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x17, x23, LSL #22\n"
"mov x22, #0x6\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x7, x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x7, x14\n"
+ "orr x20, x17, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x7, x21, x14\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040b82 // mova za.d[x8, #2], { z28.d-z29.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ ".inst 0xc0040b83 // mova za.d[x8, #3], { z28.d-z29.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
"ldp x27, x26, [x23], #0x10\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
@@ -204,22 +209,22 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
"sub x13, x13, x21\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -231,148 +236,148 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z20.s }, p1/Z, [x14]\n"
+ "ld1b { z27.s }, p1/Z, [x14]\n"
"addvl x20, SP, #8\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z20.h, z16.h\n"
- "add z4.h, z4.h, z21.h\n"
- "ld1b { z23.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z5.h, z23.h, z22.h\n"
- "add z5.h, z5.h, z21.h\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "trn1 z6.h, z17.h, z16.h\n"
- "add z6.h, z6.h, z21.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
- ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
- ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
- ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z27.h, z16.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "trn1 z16.h, z3.h, z1.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z15.h, z15.h, z20.h\n"
+ "trn1 z17.h, z12.h, z18.h\n"
+ "add z16.h, z16.h, z20.h\n"
+ "add z17.h, z17.h, z20.h\n"
+ ".inst 0xc16b15e8 // sdot za.s[x8, 0], { z15.h-z16.h }, z11.h\n"
+ ".inst 0xc16a15e9 // sdot za.s[x8, 1], { z15.h-z16.h }, z10.h\n"
+ ".inst 0xc1631608 // sdot za.s[x8, 0], { z16.h-z17.h }, z3.h\n"
+ ".inst 0xc1621609 // sdot za.s[x8, 1], { z16.h-z17.h }, z2.h\n"
"9:" // Unpadded: 1 priming loads
"add x22, x14, %x[ld_in_row]\n"
- "ld1b { z25.s }, p1/Z, [x14]\n"
+ "ld1b { z22.s }, p1/Z, [x14]\n"
"addvl x21, SP, #4\n"
- "ld1b { z6.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z3.h, z25.h, z6.h\n"
- "add z3.h, z3.h, z21.h\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #8\n"
- "ld1b { z26.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z4.h, z18.h, z26.h\n"
- "add z4.h, z4.h, z21.h\n"
- "ld1b { z2.s }, p1/Z, [x22]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z5.s }, p1/Z, [x22]\n"
- "trn1 z5.h, z2.h, z5.h\n"
- "add z5.h, z5.h, z21.h\n"
+ "ld1b { z10.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z22.h, z16.h\n"
+ "ld1b { z7.s }, p1/Z, [x22]\n"
+ "trn1 z22.h, z19.h, z10.h\n"
".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
- ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
- ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
- ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
- ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
- ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
- ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "add z21.h, z21.h, z20.h\n"
+ "trn1 z23.h, z11.h, z7.h\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
+ ".inst 0xc16116a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16016a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z0.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16e16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16616c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16916ca // sdot za.s[x8, 2], { z22.h-z23.h }, z9.h\n"
+ ".inst 0xc16116cb // sdot za.s[x8, 3], { z22.h-z23.h }, z1.h\n"
"10:" // Unpadded: 0 priming loads
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"add x20, x14, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x14]\n"
+ "ld1b { z15.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x1\n"
- "ld1b { z9.s }, p1/Z, [x20]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z6.h, z17.h, z9.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x15, x13\n"
- "add z6.h, z6.h, z21.h\n"
- "ld1b { z7.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z7.h, z17.h, z7.h\n"
"csel x23, x15, x13, LT\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ "ld1b { z2.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z7.h, z7.h, z21.h\n"
+ "trn1 z21.h, z15.h, z0.h\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z1.s }, p1/Z, [x20]\n"
- "trn1 z8.h, z17.h, z1.h\n"
- "add z8.h, z8.h, z21.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"sub x13, x13, x23\n"
+ "trn1 z22.h, z24.h, z9.h\n"
+ "trn1 z23.h, z2.h, z15.h\n"
+ "add z21.h, z21.h, z20.h\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
"addvl x22, SP, #4\n"
"addvl x21, SP, #8\n"
- "ld1b { z2.s }, p1/Z, [x14]\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z26.s }, p1/Z, [x14]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
"add x20, x14, %x[ld_in_row]\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- "ld1b { z19.s }, p1/Z, [x20]\n"
+ "ld1b { z4.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- "ld1b { z23.s }, p1/Z, [x20]\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
- "ld1b { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
- "ld1b { z17.s }, p1/Z, [x20]\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z14.h\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
- "trn1 z6.h, z2.h, z19.h\n"
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xc16616ab // sdot za.s[x8, 3], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ "ld1b { z11.s }, p1/Z, [x20]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc16916ac // sdot za.s[x8, 4], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16116ad // sdot za.s[x8, 5], { z21.h-z22.h }, z1.h\n"
+ "trn1 z21.h, z26.h, z4.h\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16f16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc1a5ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z5.s\n"
+ ".inst 0xc16716cb // sdot za.s[x8, 3], { z22.h-z23.h }, z7.h\n"
".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
- "st1b { z24.s }, p1, [x11]\n"
- "add x11, x11, x9\n"
- "add z6.h, z6.h, z21.h\n"
- ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
- "trn1 z7.h, z23.h, z18.h\n"
- "trn1 z8.h, z17.h, z16.h\n"
+ "add z21.h, z21.h, z20.h\n"
+ ".inst 0xc1adaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc16916cc // sdot za.s[x8, 4], { z22.h-z23.h }, z9.h\n"
+ ".inst 0xc16116cd // sdot za.s[x8, 5], { z22.h-z23.h }, z1.h\n"
+ "trn1 z22.h, z27.h, z3.h\n"
+ "trn1 z23.h, z25.h, z11.h\n"
"add x8, x8, #0x2\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z26.s }, p1, [x10]\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add z22.h, z22.h, z20.h\n"
+ "add z23.h, z23.h, z20.h\n"
+ ".inst 0xc1bfcfd0 // sclamp { z16.s-z19.s }, z30.s, z31.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z18.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z17.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- "add z7.h, z7.h, z21.h\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z19.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "add z8.h, z8.h, z21.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -382,258 +387,258 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1b { z17.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z7.h, z19.h, z18.h\n"
- "trn1 z8.h, z17.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #8\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z9.h, z17.h, z16.h\n"
- ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
- ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
- ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ ".inst 0xc16c16e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z12.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc16416e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z4.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ ".inst 0xc16f1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1671709 // sdot za.s[x8, 1], { z24.h-z25.h }, z7.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x22, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x21, SP, #4\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xa1412aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "ld1b { z17.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z16.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z19.h, z18.h\n"
- "trn1 z23.h, z17.h, z16.h\n"
+ "ld1b { z10.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z10.h, p0/M, z10.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z17.h, z18.h, z10.h\n"
+ "add z14.h, p0/M, z14.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #8\n"
- "trn1 z24.h, z17.h, z16.h\n"
- ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
- ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
- ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
- ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
- ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xc16f1608 // sdot za.s[x8, 0], { z16.h-z17.h }, z15.h\n"
+ "ld1b { z10.s }, p0/Z, [x22]\n"
+ ".inst 0xc1671609 // sdot za.s[x8, 1], { z16.h-z17.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ "add z10.h, p0/M, z10.h, z20.h\n"
+ ".inst 0xc16f160a // sdot za.s[x8, 2], { z16.h-z17.h }, z15.h\n"
+ ".inst 0xc167160b // sdot za.s[x8, 3], { z16.h-z17.h }, z7.h\n"
+ "trn1 z18.h, z14.h, z10.h\n"
+ ".inst 0xc16c1628 // sdot za.s[x8, 0], { z17.h-z18.h }, z12.h\n"
+ ".inst 0xc1641629 // sdot za.s[x8, 1], { z17.h-z18.h }, z4.h\n"
".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
- ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xc161162a // sdot za.s[x8, 2], { z17.h-z18.h }, z1.h\n"
+ ".inst 0xc160162b // sdot za.s[x8, 3], { z17.h-z18.h }, z0.h\n"
"15:" // Padded: 0 priming loads
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
"cbz x15, 18f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z19.s }, p0/Z, [x14]\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
"add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "ld1b { z17.s }, p0/Z, [x14]\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z6.h, z19.h, z18.h\n"
- "trn1 z7.h, z17.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
"ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
"ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- "sub x15, x15, #0x1\n"
- "sub x13, x13, #0x1\n"
- "cmp x15, x13\n"
- "trn1 z8.h, z17.h, z16.h\n"
- "csel x23, x15, x13, LT\n"
- "add x14, x14, %x[ld_in_col]\n"
- "sub x13, x13, x23\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
"cbz x23, 17f\n"
"16:" // Padded: Main loop
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z9.s }, p0/Z, [x14]\n"
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- "add z9.h, p0/M, z9.h, z21.h\n"
"add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ "addvl x21, SP, #4\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z16.s }, p0/Z, [x14]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x22]\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- "add z19.h, p0/M, z19.h, z21.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xc16f16ac // sdot za.s[x8, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ad // sdot za.s[x8, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16e16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z14.h\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16616cb // sdot za.s[x8, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
"ld1b { z18.s }, p0/Z, [x22]\n"
- "add z18.h, p0/M, z18.h, z21.h\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16f16cc // sdot za.s[x8, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16716cd // sdot za.s[x8, 5], { z22.h-z23.h }, z7.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z21.h, z16.h, z17.h\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ "add z18.h, p0/M, z18.h, z20.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x22]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"mov x12, #0x4\n"
- "addvl x21, SP, #4\n"
- "add z16.h, p0/M, z16.h, z21.h\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #8\n"
- ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
- "subs x23, x23, #0x1\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
"ld1b { z17.s }, p0/Z, [x22]\n"
- ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z21.h\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- "ld1b { z2.s }, p0/Z, [x22]\n"
- ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
- "add z2.h, p0/M, z2.h, z21.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
- ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "add z17.h, p0/M, z17.h, z20.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "trn1 z6.h, z9.h, z19.h\n"
- ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- "trn1 z7.h, z18.h, z16.h\n"
- "trn1 z8.h, z17.h, z2.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add z16.h, p0/M, z16.h, z20.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
"bgt 16b\n"
"17:" // Main loop tail
- ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16c16a8 // sdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
"addvl x21, SP, #4\n"
"addvl x20, SP, #8\n"
- ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
- ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
- ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
- ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
- ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
- ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
- ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
- ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
- ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
- "st1b { z24.s }, p1, [x11]\n"
+ ".inst 0xc16416a9 // sdot za.s[x8, 1], { z21.h-z22.h }, z4.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b16c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a16c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f16aa // sdot za.s[x8, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ab // sdot za.s[x8, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060800 // mova { z0.d-z1.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060822 // mova { z2.d-z3.d }, za.d[x8, #1]\n"
+ ".inst 0xc16f16ac // sdot za.s[x8, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc16716ad // sdot za.s[x8, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16e16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc16616cb // sdot za.s[x8, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1412a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1adaa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc16c16cc // sdot za.s[x8, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc16416cd // sdot za.s[x8, 5], { z22.h-z23.h }, z4.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ ".inst 0xc1bfcfc0 // sclamp { z0.s-z3.s }, z30.s, z31.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
- "st1b { z26.s }, p1, [x10]\n"
+ "st1b { z2.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
- "st1b { z25.s }, p1, [x27]\n"
+ "st1b { z1.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
- ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z3.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
- ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
- "add x8, x8, #0x2\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
"18:" // Main loop skip tail
"cbz x13, 20f\n"
"19:" // Right padding loop
- ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
- ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
"add x8, x8, #0x2\n"
- ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
- ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
- ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
- ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
- ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
- "st1b { z4.s }, p1, [x11]\n"
+ ".inst 0xc0040b84 // mova za.d[x8, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0040b85 // mova za.d[x8, #5], { z28.d-z29.d }\n"
+ ".inst 0xc1a5ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1adaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc1a8ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc1bfcfd8 // sclamp { z24.s-z27.s }, z30.s, z31.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z6.s }, p1, [x10]\n"
+ "st1b { z26.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z5.s }, p1, [x27]\n"
+ "st1b { z25.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z7.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 19b\n"
"20:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -652,6 +657,8 @@ void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #12\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
index 8ce04fb8c2..1de49f698b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,119 +70,124 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0x9\n"
"ptrue p2.b\n"
- "mov x20, #0x9\n"
"ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x6\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x22, #0x8\n"
"ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z29.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x21, x6\n"
+ "mov SP, x20\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-6\n"
+ "ld1rw { z1.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x17\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z29.h, p2/M, z29.h\n"
+ "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x7\n"
- "addvl SP, SP, #-6\n"
- "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z11.h, p2/M, z11.h\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z28.s, #0x0\n"
+ "mov z16.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x16, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z22.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z27.h, #0x0\n"
+ "addvl x22, SP, #6\n"
+ "addvl x22, x22, #-2\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z17.d, z16.d\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z26.h, z26.h, z16.h\n"
- "incw x22\n"
- "mov z24.h, #0x0\n"
- "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z3.h, z3.h, z16.h\n"
- "trn1 z31.h, z26.h, z3.h\n"
- "ld1sb { z21.s }, p2/Z, [x20]\n"
- "sub z21.h, z21.h, z16.h\n"
- "mov x20, x22\n"
- "trn1 z14.h, z21.h, z24.h\n"
- "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z25.h, z25.h, z22.h\n"
+ "sub z15.h, z15.h, z22.h\n"
+ "sub z9.h, z9.h, z22.h\n"
+ "trn1 z24.h, z25.h, z15.h\n"
+ "ld1sb { z12.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z2.h, z2.h, z16.h\n"
- "addvl x21, SP, #6\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "sub z25.h, z25.h, z16.h\n"
- "incw x22\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
- "sub z27.h, z27.h, z16.h\n"
- "addvl x21, x21, #-2\n"
- "mov x20, x22\n"
- "st1h { z31.h }, p2, [x21]\n"
- "trn1 z4.h, z2.h, z25.h\n"
- "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "trn1 z11.h, z9.h, z27.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "sub z12.h, z12.h, z22.h\n"
+ "sub z4.h, z4.h, z22.h\n"
+ "st1h { z24.h }, p2, [x22]\n"
+ "sub z15.h, z15.h, z22.h\n"
+ "st1h { z11.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #-2\n"
+ "trn1 z9.h, z12.h, z4.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #3\n"
- "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z12.h, z27.h, z24.h\n"
- "ld1sb { z20.s }, p2/Z, [x20]\n"
- "sub z26.h, z26.h, z16.h\n"
- "sub z23.h, z23.h, z16.h\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "sub z20.h, z20.h, z16.h\n"
- "addvl x21, x21, #-2\n"
- "st1h { z4.h }, p2, [x21]\n"
- "mov z29.d, z28.d\n"
- "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
- "addvl x21, x21, #-2\n"
- "mov z30.d, z28.d\n"
- "mov z31.d, z28.d\n"
- "trn1 z25.h, z26.h, z23.h\n"
- "st1h { z25.h }, p2, [x21]\n"
- "trn1 z3.h, z20.h, z24.h\n"
- "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "trn1 z21.h, z15.h, z27.h\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z22.h\n"
+ "sub z10.h, z10.h, z22.h\n"
+ "st1h { z9.h }, p2, [x22]\n"
+ "sub z30.h, z30.h, z22.h\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #-2\n"
+ "trn1 z15.h, z14.h, z10.h\n"
+ "trn1 z25.h, z30.h, z27.h\n"
+ "st1h { z15.h }, p2, [x22]\n"
+ "st1h { z25.h }, p2, [x22, #1, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z1.s }, p1/Z, [x21, x16, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z0.s }, p1/Z, [x20, x16, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x15, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x17, x23, LSL #22\n"
"mov x22, #0x9\n"
- "add x21, x7, x6\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x7, x6\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x7, x14\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x15, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x7, x14\n"
+ "orr x20, x17, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x14, x7, x20, x14\n"
- ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
"mov x22, #0x2\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x14, x7, x21, x14\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
"ldp x11, x10, [x23], #0x10\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
"ldp x9, x28, [x20], #0x10\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
"ldp x27, x26, [x23], #0x10\n"
"ldp x25, x24, [x20], #0x10\n"
"cbz x21, 7f\n"
@@ -191,24 +196,24 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
"sub x13, x13, x21\n"
- ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ ".inst 0xc1a1ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a8ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ ".inst 0xc1bccfec // sclamp { z12.s-z15.s }, z31.s, z28.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z24.s }, p1, [x11]\n"
+ "st1b { z12.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z25.s }, p1, [x10]\n"
+ "st1b { z13.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z26.s }, p1, [x27]\n"
+ "st1b { z14.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z27.s }, p1, [x26]\n"
+ "st1b { z15.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -220,194 +225,194 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 2 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z1.s }, p1/Z, [x14]\n"
+ "ld1b { z23.s }, p1/Z, [x14]\n"
"addvl x20, SP, #4\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z21.h\n"
- "add z1.h, z1.h, z11.h\n"
- "ld1b { z2.s }, p1/Z, [x21]\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z15.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z2.h, z2.h, z15.h\n"
- "add z2.h, z2.h, z11.h\n"
- "ld1b { z3.s }, p1/Z, [x21]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z3.h, z3.h, z21.h\n"
- "add z3.h, z3.h, z11.h\n"
- "ld1b { z4.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z4.h, z19.h\n"
- "add z4.h, z4.h, z11.h\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
- "mov z5.d, z8.d\n"
- "add z5.h, z5.h, z11.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z6.h\n"
+ "ld1b { z10.s }, p1/Z, [x21]\n"
+ "add z23.h, z23.h, z29.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "add z24.h, z24.h, z29.h\n"
+ "mov z27.d, z10.d\n"
+ "add z25.h, z25.h, z29.h\n"
+ "add z26.h, z26.h, z29.h\n"
+ "add z27.h, z27.h, z29.h\n"
+ ".inst 0xc17616e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z6.h\n"
+ ".inst 0xc17e1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z14.h\n"
"9:" // Unpadded: 1 priming loads
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z1.s }, p1/Z, [x14]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
"addvl x20, SP, #2\n"
- "ld1b { z21.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z21.h\n"
- "add z1.h, z1.h, z11.h\n"
"ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z12.s }, p1/Z, [x21]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z2.h, z2.h, z12.h\n"
- "add z2.h, z2.h, z11.h\n"
- "ld1b { z3.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
+ "trn1 z20.h, z20.h, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z3.h, z3.h, z8.h\n"
- "add z3.h, z3.h, z11.h\n"
- "ld1b { z4.s }, p1/Z, [x21]\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z5.s }, p1/Z, [x21]\n"
+ "trn1 z21.h, z21.h, z25.h\n"
+ "ld1b { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z4.h, z4.h, z5.h\n"
- "add z4.h, z4.h, z11.h\n"
- "ld1b { z5.s }, p1/Z, [x21]\n"
- "mov z5.d, z5.d\n"
- "add z5.h, z5.h, z11.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
- ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "trn1 z22.h, z22.h, z24.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add z20.h, z20.h, z29.h\n"
+ ".inst 0xa0402a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20]\n"
+ "trn1 z23.h, z23.h, z9.h\n"
+ "add z21.h, z21.h, z29.h\n"
+ "mov z24.d, z3.d\n"
+ "add z22.h, z22.h, z29.h\n"
+ "add z23.h, z23.h, z29.h\n"
+ "add z24.h, z24.h, z29.h\n"
+ ".inst 0xc1761688 // sdot za.s[x8, 0], { z20.h-z23.h }, z6.h\n"
+ ".inst 0xc17716a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z7.h\n"
"10:" // Unpadded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"add x21, x14, %x[ld_in_row]\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
+ "ld1b { z10.s }, p1/Z, [x14]\n"
"sub x15, x15, #0x2\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z8.h\n"
"sub x13, x13, #0x1\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"lsr x20, x15, #0x1\n"
- "add z21.h, z21.h, z11.h\n"
- "ld1b { z25.s }, p1/Z, [x21]\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z25.h\n"
"cmp x20, x13\n"
- "ld1b { z23.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z25.h\n"
"csel x23, x20, x13, LT\n"
- "add z22.h, z22.h, z11.h\n"
- "ld1b { z18.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z18.h\n"
- "add z23.h, z23.h, z11.h\n"
- "ld1b { z24.s }, p1/Z, [x21]\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x14, x14, %x[ld_in_col]\n"
- "ld1b { z19.s }, p1/Z, [x21]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z19.h\n"
- "add z24.h, z24.h, z11.h\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
- "mov z25.d, z8.d\n"
- "add z25.h, z25.h, z11.h\n"
+ "trn1 z11.h, z11.h, z24.h\n"
"and x15, x15, #0x1\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
"sub x13, x13, x23\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z30.h\n"
+ "add z10.h, z10.h, z29.h\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "add z11.h, z11.h, z29.h\n"
+ "mov z14.d, z26.d\n"
+ "add z12.h, z12.h, z29.h\n"
+ "add z13.h, z13.h, z29.h\n"
+ "add z14.h, z14.h, z29.h\n"
"cbz x23, 17f\n"
"11:" // Unpadded: Main loop
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
"addvl x20, SP, #4\n"
"add x22, x14, %x[ld_in_row]\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
"addvl x21, SP, #2\n"
"subs x23, x23, #0x1\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1771549 // sdot za.s[x8, 1], { z10.h-z13.h }, z7.h\n"
+ "ld1b { z3.s }, p1/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "add x20, x14, %x[ld_in_row]\n"
- "ld1b { z18.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- "trn1 z21.h, z21.h, z18.h\n"
- "ld1b { z22.s }, p1/Z, [x22]\n"
+ "ld1b { z9.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z21.h, z21.h, z11.h\n"
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f1569 // sdot za.s[x8, 1], { z11.h-z14.h }, z15.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z8.h\n"
- "add z22.h, z22.h, z11.h\n"
- "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z3.h, z3.h, z9.h\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "ld1b { z27.s }, p1/Z, [x22]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z27.h\n"
- "add z23.h, z23.h, z11.h\n"
- "ld1b { z24.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a1ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z1.s\n"
+ "ld1b { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z8.h\n"
- "add z24.h, z24.h, z11.h\n"
- "ld1b { z4.s }, p1/Z, [x22]\n"
- "mov z25.d, z4.d\n"
- "add z25.h, z25.h, z11.h\n"
- ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
- ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
- "ld1b { z21.s }, p1/Z, [x14]\n"
- ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- "ld1b { z12.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z12.h\n"
- ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "trn1 z4.h, z4.h, z15.h\n"
+ "add z3.h, z3.h, z29.h\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z5.h, z5.h, z10.h\n"
+ "ld1b { z21.s }, p1/Z, [x22]\n"
+ ".inst 0xc1a0aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0402aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21]\n"
+ "trn1 z6.h, z6.h, z14.h\n"
+ "add z4.h, z4.h, z29.h\n"
+ "mov z7.d, z21.d\n"
+ "add z5.h, z5.h, z29.h\n"
+ ".inst 0xc1a8ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add z6.h, z6.h, z29.h\n"
+ "add z7.h, z7.h, z29.h\n"
+ ".inst 0xc1bccff8 // sclamp { z24.s-z27.s }, z31.s, z28.s\n"
+ ".inst 0xc17a1468 // sdot za.s[x8, 0], { z3.h-z6.h }, z10.h\n"
+ "ld1b { z10.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
"ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z0.s }, p1, [x11]\n"
+ "st1b { z24.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ ".inst 0xc17b1488 // sdot za.s[x8, 0], { z4.h-z7.h }, z11.h\n"
+ "ld1b { z11.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z20.h\n"
- "st1b { z1.s }, p1, [x10]\n"
- "ld1b { z23.s }, p1/Z, [x20]\n"
+ "trn1 z10.h, z10.h, z22.h\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "st1b { z25.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z2.s }, p1, [x27]\n"
- "ld1b { z24.s }, p1/Z, [x20]\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z24.h\n"
+ "st1b { z26.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "ld1b { z24.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "st1b { z3.s }, p1, [x26]\n"
+ "st1b { z27.s }, p1, [x26]\n"
"add x26, x26, x24\n"
- "ld1b { z3.s }, p1/Z, [x20]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z3.h\n"
- "add z21.h, z21.h, z11.h\n"
- "ld1b { z3.s }, p1/Z, [x20]\n"
- "mov z25.d, z3.d\n"
- "add z22.h, z22.h, z11.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- "add z23.h, z23.h, z11.h\n"
- "add z24.h, z24.h, z11.h\n"
- "add z25.h, z25.h, z11.h\n"
+ "trn1 z11.h, z11.h, z14.h\n"
+ "add z10.h, z10.h, z29.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z12.h, z12.h, z9.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
+ "trn1 z13.h, z13.h, z6.h\n"
+ "add z11.h, z11.h, z29.h\n"
+ "mov z14.d, z20.d\n"
+ "add z12.h, z12.h, z29.h\n"
+ "add z13.h, z13.h, z29.h\n"
+ "add z14.h, z14.h, z29.h\n"
"bgt 11b\n"
"b 17f\n"
"12:" // Padded
@@ -417,440 +422,440 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"beq 14f\n"
"13:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x14]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z20.h, z20.h, z22.h\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z4.s }, p0/Z, [x20]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z17.h\n"
- "trn1 z23.h, z23.h, z4.h\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z23.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z24.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z11.h\n"
- "addvl x20, SP, #4\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "trn1 z25.h, z25.h, z17.h\n"
- ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
- "mov z26.d, z1.d\n"
- ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
+ ".inst 0xc1731688 // sdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ "mov z24.d, z24.d\n"
+ ".inst 0xc17b16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z11.h\n"
"14:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x14]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z5.s }, p0/Z, [x20]\n"
- "add z5.h, p0/M, z5.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z22.h, z22.h, z17.h\n"
- "trn1 z23.h, z23.h, z5.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z20.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z15.s }, p0/Z, [x20]\n"
- "add z15.h, p0/M, z15.h, z11.h\n"
- "addvl x20, SP, #2\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "trn1 z25.h, z25.h, z17.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "mov z26.d, z15.d\n"
- ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "trn1 z24.h, z24.h, z25.h\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
+ ".inst 0xc17316a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "mov z25.d, z20.d\n"
+ ".inst 0xc17b16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
"15:" // Padded: 0 priming loads
"cmp x15, #0x2\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"blt 18f\n"
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
- "mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z18.h\n"
- "trn1 z22.h, z22.h, z3.h\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "mov x12, #0x8\n"
- "add z20.h, p0/M, z20.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
"sub x15, x15, #0x2\n"
"sub x13, x13, #0x1\n"
- "trn1 z23.h, z23.h, z19.h\n"
- "trn1 z24.h, z24.h, z20.h\n"
"lsr x20, x15, #0x1\n"
"cmp x20, x13\n"
- "mov z25.d, z3.d\n"
- "csel x22, x20, x13, LT\n"
- "add x14, x14, %x[ld_in_col]\n"
"and x15, x15, #0x1\n"
- "sub x13, x13, x22\n"
- "cbz x22, 17f\n"
- "16:" // Padded: Main loop
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "addvl x20, SP, #4\n"
- "mov x12, #0x0\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x21, x14, %x[ld_in_row]\n"
- ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "ld1b { z10.s }, p0/Z, [x14]\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z14.s }, p0/Z, [x21]\n"
- "add z14.h, p0/M, z14.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
- "ld1b { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z20.h\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z15.s }, p0/Z, [x21]\n"
"mov x12, #0x4\n"
- "add z15.h, p0/M, z15.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x21]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z20.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x21]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z20.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z4.s }, p0/Z, [x21]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
"mov x12, #0x8\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z14.h\n"
- "trn1 z22.h, z22.h, z15.h\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "addvl x20, SP, #2\n"
- "ld1b { z2.s }, p0/Z, [x21]\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z4.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z21.h\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
+ "mov z14.d, z20.d\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "add z2.h, p0/M, z2.h, z11.h\n"
- "add x14, x14, %x[ld_in_col]\n"
- ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721549 // sdot za.s[x8, 1], { z10.h-z13.h }, z2.h\n"
+ "ld1b { z10.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x20, x14, %x[ld_in_row]\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "mov z25.d, z2.d\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1731569 // sdot za.s[x8, 1], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1b { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z26.h, p0/M, z26.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ "ld1b { z11.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z26.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z4.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z5.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "st1b { z6.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z11.h, z11.h, z9.h\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z9.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- "add x20, x20, %x[ld_in_row]\n"
- "add z12.h, p0/M, z12.h, z11.h\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z11.h\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- "subs x22, x22, #0x1\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
- "add x11, x11, x9\n"
- "trn1 z21.h, z21.h, z20.h\n"
- "st1b { z17.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "trn1 z22.h, z22.h, z4.h\n"
- "trn1 z23.h, z23.h, z27.h\n"
- "st1b { z18.s }, p1, [x27]\n"
- "add x27, x27, x25\n"
- "trn1 z24.h, z24.h, z12.h\n"
- "mov z25.d, z8.d\n"
- "st1b { z19.s }, p1, [x26]\n"
- "add x26, x26, x24\n"
- "add x14, x14, %x[ld_in_col]\n"
- "bgt 16b\n"
- "17:" // Main loop tail
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "addvl x20, SP, #4\n"
"mov x12, #0x0\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z9.s }, p0/Z, [x22]\n"
+ "trn1 z13.h, z13.h, z20.h\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "add x20, x14, %x[ld_in_row]\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- "ld1b { z0.s }, p0/Z, [x14]\n"
- "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "ld1b { z10.s }, p0/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "mov z14.d, z9.d\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z14.s }, p0/Z, [x20]\n"
- "add z14.h, p0/M, z14.h, z11.h\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z11.h\n"
+ "ld1b { z11.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z25.h\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z12.s }, p0/Z, [x20]\n"
"mov x12, #0x4\n"
- "add z12.h, p0/M, z12.h, z11.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z2.s }, p0/Z, [x20]\n"
- "add z2.h, p0/M, z2.h, z11.h\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z15.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z4.h, p0/M, z4.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add z3.h, p0/M, z3.h, z11.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z4.h\n"
+ "add z13.h, p0/M, z13.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z25.h, p0/M, z25.h, z11.h\n"
+ "add z4.h, p0/M, z4.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "trn1 z13.h, z13.h, z4.h\n"
+ "add z26.h, p0/M, z26.h, z29.h\n"
+ "mov z14.d, z26.d\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc1721548 // sdot za.s[x8, 0], { z10.h-z13.h }, z2.h\n"
+ "addvl x22, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
"addvl x20, SP, #2\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- "trn1 z0.h, z0.h, z14.h\n"
- "add x8, x8, #0x1\n"
- "add z27.h, p0/M, z27.h, z11.h\n"
- "trn1 z1.h, z1.h, z12.h\n"
- "trn1 z2.h, z2.h, z21.h\n"
+ ".inst 0xc1731568 // sdot za.s[x8, 0], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1721549 // sdot za.s[x8, 1], { z10.h-z13.h }, z2.h\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
"add x14, x14, %x[ld_in_col]\n"
- "trn1 z3.h, z3.h, z25.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- "mov z4.d, z27.d\n"
- ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add z9.h, p0/M, z9.h, z29.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1731569 // sdot za.s[x8, 1], { z11.h-z14.h }, z3.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ "ld1b { z10.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z15.h\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ "add z10.h, p0/M, z10.h, z29.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "st1b { z17.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
- ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
- "st1b { z18.s }, p1, [x27]\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z19.s }, p1, [x26]\n"
+ "trn1 z10.h, z10.h, z15.h\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
+ "add z11.h, p0/M, z11.h, z29.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z5.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z5.h, p0/M, z5.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z5.h\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z15.h\n"
+ "add z5.h, p0/M, z5.h, z29.h\n"
+ ".inst 0xc1721528 // sdot za.s[x8, 0], { z9.h-z12.h }, z2.h\n"
+ "mov z13.d, z5.d\n"
+ ".inst 0xc1731548 // sdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ ".inst 0xa0402be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP]\n"
"18:" // Main loop skip tail
"cbz x15, 19f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x21, x14, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #4\n"
+ "sub x13, x13, #0x1\n"
"ld1b { z21.s }, p0/Z, [x14]\n"
- "add z21.h, p0/M, z21.h, z11.h\n"
- "add x20, x14, %x[ld_in_row]\n"
+ "add z21.h, p0/M, z21.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z15.h\n"
+ "add z22.h, p0/M, z22.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z0.s }, p0/Z, [x20]\n"
- "add z0.h, p0/M, z0.h, z11.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z21.h, z21.h, z17.h\n"
- "trn1 z22.h, z22.h, z0.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x20]\n"
- "add z23.h, p0/M, z23.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z12.h\n"
+ "add z23.h, p0/M, z23.h, z29.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z20.h, p0/M, z20.h, z29.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z20.h\n"
+ "add z24.h, p0/M, z24.h, z29.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z5.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z5.h, p0/M, z5.h, z11.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z30.h, p0/M, z30.h, z29.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z4.s }, p0/Z, [x20]\n"
- "add z4.h, p0/M, z4.h, z11.h\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z5.h\n"
- "mov z25.d, z4.d\n"
- "addvl x20, SP, #4\n"
- ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
- "sub x13, x13, #0x1\n"
- ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
- ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
- ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
- ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z6.s }, p0/Z, [x21]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "add z6.h, p0/M, z6.h, z29.h\n"
+ ".inst 0xc17216a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "mov z25.d, z6.d\n"
+ ".inst 0xc17316c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17516a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z5.h\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc17d16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z13.h\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
- "st1b { z16.s }, p1, [x11]\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- "st1b { z17.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z18.s }, p1, [x27]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z19.s }, p1, [x26]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"19:" // Tail input: End
"cbz x13, 21f\n"
"20:" // Right padding loop
- ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
"subs x13, x13, #0x1\n"
- ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
- ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
- ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
- "st1b { z0.s }, p1, [x11]\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ ".inst 0xc1a1ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z1.s\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a8ab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z8.s\n"
+ ".inst 0xc1bccfe4 // sclamp { z4.s-z7.s }, z31.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
"add x11, x11, x9\n"
- "st1b { z1.s }, p1, [x10]\n"
+ "st1b { z5.s }, p1, [x10]\n"
"add x10, x10, x28\n"
- "st1b { z2.s }, p1, [x27]\n"
+ "st1b { z6.s }, p1, [x27]\n"
"add x27, x27, x25\n"
- "st1b { z3.s }, p1, [x26]\n"
+ "st1b { z7.s }, p1, [x26]\n"
"add x26, x26, x24\n"
"bgt 20b\n"
"21:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
- "incw x16\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "whilelt p1.s, x16, x17\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -869,6 +874,8 @@ void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #6\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
index 64023eeaff..baaf51c711 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,249 +70,254 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
"ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
- "ptrue p2.b\n"
+ "mov x22, SP\n"
"mov x20, #0x8\n"
+ "ptrue p2.b\n"
"ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x5\n"
- ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x21, x22, #0x8\n"
"ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x21, x21, #-0x400\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "sub x20, x20, x5\n"
+ "mov SP, x21\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
+ "addvl SP, SP, #-30\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
"whilelt p1.s, XZR, x7\n"
"whilelt p9.s, XZR, x20\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "neg z15.h, p2/M, z15.h\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
"whilelt p8.s, XZR, x6\n"
- "addvl SP, SP, #-30\n"
- "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z17.h, p2/M, z17.h\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z18.s, #0x0\n"
+ "mov z28.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x23\n"
- "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z0.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z13.h, #0x0\n"
+ "addvl x22, SP, #30\n"
+ "addvl x22, x22, #-6\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z29.d, z28.d\n"
+ "mov x23, x24\n"
+ "incw x24\n"
+ "ld1sb { z22.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1sb { z21.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1sb { z19.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "ld1sb { z25.s }, p2/Z, [x23]\n"
+ "incw x23, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z0.h\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "ld1sb { z5.s }, p2/Z, [x23]\n"
+ "mov x20, x24\n"
+ "incw x24\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "sub z25.h, z25.h, z0.h\n"
+ "sub z5.h, z5.h, z0.h\n"
+ "trn1 z6.h, z13.h, z22.h\n"
+ "trn1 z23.h, z22.h, z21.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z15.h, #0x0\n"
- "sub z2.h, z2.h, z3.h\n"
- "incw x23\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "trn1 z4.h, z21.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z13.h, z13.h, z3.h\n"
- "trn1 z11.h, z15.h, z2.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "trn1 z26.h, z19.h, z25.h\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z27.h, z27.h, z3.h\n"
- "trn1 z0.h, z2.h, z13.h\n"
+ "trn1 z22.h, z25.h, z5.h\n"
+ "ld1sb { z7.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z25.h, z5.h, z13.h\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "sub z9.h, z9.h, z0.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "mov x20, x24\n"
+ "sub z18.h, z18.h, z0.h\n"
+ "st1h { z6.h }, p2, [x22]\n"
+ "incw x24\n"
+ "sub z7.h, z7.h, z0.h\n"
+ "st1h { z23.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z1.h, z1.h, z0.h\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z20.h, z13.h, z27.h\n"
+ "trn1 z12.h, z27.h, z9.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z2.h, z9.h, z18.h\n"
"ld1sb { z19.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z19.h, z19.h, z3.h\n"
- "trn1 z26.h, z13.h, z27.h\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z9.h, z18.h, z7.h\n"
"ld1sb { z14.s }, p2/Z, [x20]\n"
- "sub z14.h, z14.h, z3.h\n"
- "mov x20, x23\n"
- "trn1 z10.h, z27.h, z19.h\n"
- "ld1sb { z9.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z19.h, z19.h, z14.h\n"
- "trn1 z1.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z18.h, z7.h, z1.h\n"
"ld1sb { z5.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z9.h, z9.h, z3.h\n"
- "sub z5.h, z5.h, z3.h\n"
- "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "st1h { z25.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z25.h, z1.h, z13.h\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "addvl x22, x22, #-6\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "mov x20, x24\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "sub z5.h, z5.h, z0.h\n"
+ "st1h { z12.h }, p2, [x22, #1, MUL VL]\n"
+ "incw x24\n"
+ "st1h { z2.h }, p2, [x22, #2, MUL VL]\n"
+ "sub z16.h, z16.h, z0.h\n"
+ "trn1 z7.h, z13.h, z21.h\n"
+ "trn1 z20.h, z21.h, z19.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z29.h, z29.h, z3.h\n"
- "addvl x22, SP, #30\n"
+ "trn1 z17.h, z19.h, z14.h\n"
+ "st1h { z9.h }, p2, [x22, #3, MUL VL]\n"
"ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "incw x23\n"
- "sub z2.h, z2.h, z3.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
- "addvl x22, x22, #-6\n"
- "sub z23.h, z23.h, z3.h\n"
- "mov x20, x23\n"
- "st1h { z11.h }, p2, [x22]\n"
- "trn1 z20.h, z15.h, z9.h\n"
- "incw x23\n"
- "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z22.h, z9.h, z5.h\n"
- "ld1sb { z25.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z9.h, z5.h, z29.h\n"
+ "trn1 z12.h, z14.h, z5.h\n"
+ "st1h { z18.h }, p2, [x22, #4, MUL VL]\n"
"ld1sb { z21.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z26.h, z29.h, z2.h\n"
- "ld1sb { z0.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z28.h, z2.h, z23.h\n"
- "ld1sb { z19.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z2.h, z23.h, z15.h\n"
- "sub z25.h, z25.h, z3.h\n"
+ "st1h { z25.h }, p2, [x22, #5, MUL VL]\n"
"addvl x22, x22, #-6\n"
- "sub z21.h, z21.h, z3.h\n"
- "ld1sb { z6.s }, p2/Z, [x20]\n"
- "sub z0.h, z0.h, z3.h\n"
- "mov x20, x23\n"
- "sub z19.h, z19.h, z3.h\n"
- "sub z6.h, z6.h, z3.h\n"
- "st1h { z20.h }, p2, [x22]\n"
- "incw x23\n"
- "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z11.h, z15.h, z25.h\n"
- "trn1 z10.h, z25.h, z21.h\n"
- "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "trn1 z5.h, z5.h, z16.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z14.h, z21.h, z0.h\n"
+ "trn1 z4.h, z16.h, z13.h\n"
+ "sub z6.h, z6.h, z0.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "mov x20, x24\n"
+ "sub z21.h, z21.h, z0.h\n"
+ "st1h { z7.h }, p2, [x22]\n"
+ "sub z25.h, z25.h, z0.h\n"
+ "st1h { z20.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z19.h, z19.h, z0.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z1.h, z13.h, z6.h\n"
+ "trn1 z24.h, z6.h, z2.h\n"
"ld1sb { z23.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z21.h, z0.h, z19.h\n"
- "ld1sb { z27.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z19.h, z19.h, z6.h\n"
- "ld1sb { z29.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z13.h, z6.h, z15.h\n"
- "sub z5.h, z5.h, z3.h\n"
- "sub z23.h, z23.h, z3.h\n"
- "ld1sb { z1.s }, p2/Z, [x20]\n"
- "addvl x22, x22, #-6\n"
- "sub z27.h, z27.h, z3.h\n"
- "sub z29.h, z29.h, z3.h\n"
- "mov x20, x23\n"
- "st1h { z11.h }, p2, [x22]\n"
- "sub z1.h, z1.h, z3.h\n"
- "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
- "trn1 z30.h, z15.h, z5.h\n"
- "trn1 z26.h, z5.h, z23.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "trn1 z16.h, z2.h, z21.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
- "trn1 z22.h, z23.h, z27.h\n"
- "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "st1h { z12.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z20.h, z21.h, z25.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z28.h, z27.h, z29.h\n"
- "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "st1h { z5.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z17.h, z25.h, z19.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z27.h, z29.h, z1.h\n"
- "ld1sb { z9.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
- "trn1 z2.h, z1.h, z15.h\n"
- "ld1sb { z14.s }, p2/Z, [x20]\n"
- "sub z11.h, z11.h, z3.h\n"
+ "st1h { z4.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z19.h, z19.h, z13.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z23.h, z23.h, z0.h\n"
"addvl x22, x22, #-6\n"
- "sub z5.h, z5.h, z3.h\n"
- "sub z8.h, z8.h, z3.h\n"
- "st1h { z30.h }, p2, [x22]\n"
- "sub z9.h, z9.h, z3.h\n"
- "sub z14.h, z14.h, z3.h\n"
- "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
- "mov z19.d, z18.d\n"
- "trn1 z22.h, z15.h, z11.h\n"
- "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
- "trn1 z1.h, z11.h, z5.h\n"
- "trn1 z31.h, z5.h, z8.h\n"
- "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
- "trn1 z8.h, z8.h, z9.h\n"
- "trn1 z21.h, z9.h, z14.h\n"
- "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "sub z6.h, z6.h, z0.h\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "st1h { z1.h }, p2, [x22]\n"
+ "sub z22.h, z22.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "st1h { z16.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z16.h, z13.h, z23.h\n"
+ "st1h { z17.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z7.h, z23.h, z6.h\n"
+ "trn1 z12.h, z6.h, z14.h\n"
+ "st1h { z19.h }, p2, [x22, #5, MUL VL]\n"
"addvl x22, x22, #-6\n"
- "trn1 z15.h, z14.h, z15.h\n"
- "st1h { z22.h }, p2, [x22]\n"
- "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
- "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
- "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
- "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
- "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z5.h, z14.h, z22.h\n"
+ "trn1 z14.h, z22.h, z27.h\n"
+ "trn1 z20.h, z27.h, z13.h\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "st1h { z7.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z12.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z5.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z14.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z20.h }, p2, [x22, #5, MUL VL]\n"
"cbz x21, 3f\n"
- "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x21, x17, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x20, x17, LSL #2]\n"
"4:" // Load right_shift: End
"ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x25, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x7, x23, LSL #22\n"
"mov x22, #0x8\n"
- "add x21, x6, x5\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x6, x5\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x11, #0x0\n"
"mov x8, #0x8\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x6, x16\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x25, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x6, x16\n"
+ "orr x20, x7, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x16, x6, x20, x16\n"
- ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0046b80 // mova za.d[x11, #0], { z28.d-z29.d }\n"
"mov x22, #0x4\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x6, x21, x16\n"
+ ".inst 0xc0046b81 // mova za.d[x11, #1], { z28.d-z29.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046b82 // mova za.d[x11, #2], { z28.d-z29.d }\n"
"ldp x14, x13, [x23], #0x10\n"
- ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ ".inst 0xc0046b83 // mova za.d[x11, #3], { z28.d-z29.d }\n"
"ldp x4, x10, [x20], #0x10\n"
- ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
- "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ ".inst 0xc0046b84 // mova za.d[x11, #4], { z28.d-z29.d }\n"
+ ".inst 0xc0046b85 // mova za.d[x11, #5], { z28.d-z29.d }\n"
"ldp x9, x28, [x23], #0x10\n"
- ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ ".inst 0xc0046b86 // mova za.d[x11, #6], { z28.d-z29.d }\n"
"ldp x27, x26, [x20], #0x10\n"
- ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
- ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc0046b87 // mova za.d[x11, #7], { z28.d-z29.d }\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
"sub x21, x21, x20\n"
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
- ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066810 // mova { z16.d-z17.d }, za.d[x11, #0]\n"
"sub x15, x15, x21\n"
- ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
- ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
- ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
- ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ ".inst 0xc0066832 // mova { z18.d-z19.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ ".inst 0xc1abaa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc1becff0 // sclamp { z16.s-z19.s }, z31.s, z30.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z20.s }, p1, [x14]\n"
+ "st1b { z16.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z22.s }, p1, [x13]\n"
+ "st1b { z18.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z21.s }, p1, [x9]\n"
+ "st1b { z17.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z23.s }, p1, [x28]\n"
+ "st1b { z19.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
@@ -328,331 +333,331 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
"add x21, x16, %x[ld_in_row]\n"
- "ld1b { z1.s }, p1/Z, [x16]\n"
+ "ld1b { z4.s }, p1/Z, [x16]\n"
"addvl x20, SP, #24\n"
- "ld1b { z28.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z1.h, z28.h\n"
- "add z27.h, z27.h, z17.h\n"
- "ld1b { z1.s }, p1/Z, [x21]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z2.s }, p1/Z, [x21]\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z1.h, z2.h\n"
- "add z28.h, z28.h, z17.h\n"
- "ld1b { z13.s }, p1/Z, [x21]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z6.s }, p1/Z, [x21]\n"
+ "trn1 z22.h, z4.h, z13.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z13.h, z6.h\n"
- "add z29.h, z29.h, z17.h\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "trn1 z23.h, z25.h, z19.h\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ "add z22.h, z22.h, z15.h\n"
+ "trn1 z24.h, z14.h, z27.h\n"
"ld1b { z20.s }, p1/Z, [x21]\n"
- "trn1 z30.h, z30.h, z20.h\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "add z30.h, z30.h, z17.h\n"
- ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
- ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z23.h, z23.h, z15.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "trn1 z25.h, z21.h, z20.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16d76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16c76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z12.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc16e76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z14.h\n"
+ ".inst 0xc16676e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xc1617708 // sdot za.s[x11, 0], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xc1607709 // sdot za.s[x11, 1], { z24.h-z25.h }, z0.h\n"
"9:" // Unpadded: 3 priming loads
"add x22, x16, %x[ld_in_row]\n"
- "ld1b { z2.s }, p1/Z, [x16]\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
"addvl x21, SP, #18\n"
- "ld1b { z28.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z20.h, z2.h, z28.h\n"
- "add z20.h, z20.h, z17.h\n"
- "ld1b { z31.s }, p1/Z, [x22]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z11.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z21.h, z31.h, z11.h\n"
- "add z21.h, z21.h, z17.h\n"
- "ld1b { z25.s }, p1/Z, [x22]\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z22.h, z25.h, z8.h\n"
- "add z22.h, z22.h, z17.h\n"
- "ld1b { z8.s }, p1/Z, [x22]\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
- "ld1b { z3.s }, p1/Z, [x22]\n"
- "trn1 z23.h, z8.h, z3.h\n"
- ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
- "add z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z21.h, z18.h\n"
+ "ld1b { z7.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z25.h, z17.h, z3.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add z24.h, z24.h, z15.h\n"
+ "trn1 z26.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
- ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
- ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
- ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
- ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
- ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
- ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "trn1 z27.h, z17.h, z16.h\n"
+ "add z26.h, z26.h, z15.h\n"
+ ".inst 0xc1637708 // sdot za.s[x11, 0], { z24.h-z25.h }, z3.h\n"
+ ".inst 0xc1627709 // sdot za.s[x11, 1], { z24.h-z25.h }, z2.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ "add z27.h, z27.h, z15.h\n"
+ ".inst 0xc16d770a // sdot za.s[x11, 2], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc16c770b // sdot za.s[x11, 3], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1697728 // sdot za.s[x11, 0], { z25.h-z26.h }, z9.h\n"
+ ".inst 0xc1617729 // sdot za.s[x11, 1], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xc1677748 // sdot za.s[x11, 0], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc1667749 // sdot za.s[x11, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa0422a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc16c774b // sdot za.s[x11, 3], { z26.h-z27.h }, z12.h\n"
"10:" // Unpadded: 2 priming loads
"add x23, x16, %x[ld_in_row]\n"
- "ld1b { z2.s }, p1/Z, [x16]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
"addvl x22, SP, #12\n"
- "ld1b { z22.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z0.h, z2.h, z22.h\n"
- "add z0.h, z0.h, z17.h\n"
- "ld1b { z14.s }, p1/Z, [x23]\n"
+ "ld1b { z19.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
- "ld1b { z6.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z1.h, z14.h, z6.h\n"
- "add z1.h, z1.h, z17.h\n"
- "ld1b { z15.s }, p1/Z, [x23]\n"
+ "ld1b { z4.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- "ld1b { z6.s }, p1/Z, [x23]\n"
- "add x23, x23, %x[ld_in_row]\n"
- "trn1 z2.h, z15.h, z6.h\n"
- "add z2.h, z2.h, z17.h\n"
- "ld1b { z21.s }, p1/Z, [x23]\n"
+ "ld1b { z3.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
"add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
- "ld1b { z30.s }, p1/Z, [x23]\n"
- "trn1 z3.h, z21.h, z30.h\n"
- ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
- ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
- "add z3.h, z3.h, z17.h\n"
- ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
- ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
- ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
- ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
- ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
- ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
- ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
- ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
- ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
- ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
- ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z22.h, z0.h, z19.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z4.h, z3.h\n"
+ "ld1b { z9.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xa0402ac4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x22]\n"
+ "add z22.h, z22.h, z15.h\n"
+ "trn1 z24.h, z17.h, z25.h\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ "add z23.h, z23.h, z15.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "trn1 z25.h, z9.h, z17.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16576c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc16576ca // sdot za.s[x11, 2], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476cb // sdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16776e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc16676e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa0412aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16576cc // sdot za.s[x11, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16476cd // sdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xc16776ea // sdot za.s[x11, 2], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc16676eb // sdot za.s[x11, 3], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa0412a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617708 // sdot za.s[x11, 0], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xc1607709 // sdot za.s[x11, 1], { z24.h-z25.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16576ec // sdot za.s[x11, 4], { z23.h-z24.h }, z5.h\n"
+ ".inst 0xc16476ed // sdot za.s[x11, 5], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc167770a // sdot za.s[x11, 2], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc166770b // sdot za.s[x11, 3], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1422a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d770c // sdot za.s[x11, 4], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc165770d // sdot za.s[x11, 5], { z24.h-z25.h }, z5.h\n"
"11:" // Unpadded: 1 priming loads
"add x24, x16, %x[ld_in_row]\n"
- "ld1b { z0.s }, p1/Z, [x16]\n"
+ "ld1b { z16.s }, p1/Z, [x16]\n"
"addvl x23, SP, #6\n"
- "ld1b { z3.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
- "trn1 z28.h, z0.h, z3.h\n"
- "add z28.h, z28.h, z17.h\n"
- "ld1b { z6.s }, p1/Z, [x24]\n"
+ "ld1b { z22.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x22, SP, #12\n"
- "ld1b { z30.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
- "trn1 z29.h, z6.h, z30.h\n"
- "add z29.h, z29.h, z17.h\n"
- "ld1b { z1.s }, p1/Z, [x24]\n"
+ "ld1b { z19.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
"ld1b { z25.s }, p1/Z, [x24]\n"
"add x24, x24, %x[ld_in_row]\n"
- "trn1 z30.h, z1.h, z25.h\n"
- "add z30.h, z30.h, z17.h\n"
- "ld1b { z3.s }, p1/Z, [x24]\n"
- "add x24, x24, %x[ld_in_row]\n"
"addvl x20, SP, #24\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z18.h, z16.h, z22.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z5.s }, p1/Z, [x24]\n"
- "trn1 z31.h, z3.h, z5.h\n"
- ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
- "add z31.h, z31.h, z17.h\n"
- ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
- ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
- ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
- ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
- ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
- ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
- ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
- ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
- ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
- ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
- ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
- ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
- ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
- ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
- ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
- ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
- ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "ld1b { z4.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z19.h, z19.h, z25.h\n"
+ "ld1b { z27.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xa1402ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23]\n"
+ "add z18.h, z18.h, z15.h\n"
+ "trn1 z20.h, z6.h, z4.h\n"
+ "ld1b { z22.s }, p1/Z, [x24]\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "add z19.h, z19.h, z15.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "trn1 z21.h, z27.h, z22.h\n"
+ "add z20.h, z20.h, z15.h\n"
+ ".inst 0xc1697648 // sdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ ".inst 0xc1617649 // sdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22]\n"
+ "add z21.h, z21.h, z15.h\n"
+ ".inst 0xc16c764a // sdot za.s[x11, 2], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xc164764b // sdot za.s[x11, 3], { z18.h-z19.h }, z4.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d7668 // sdot za.s[x11, 0], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1657669 // sdot za.s[x11, 1], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xa1412ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163764c // sdot za.s[x11, 4], { z18.h-z19.h }, z3.h\n"
+ ".inst 0xc162764d // sdot za.s[x11, 5], { z18.h-z19.h }, z2.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16c766a // sdot za.s[x11, 2], { z19.h-z20.h }, z12.h\n"
+ ".inst 0xc164766b // sdot za.s[x11, 3], { z19.h-z20.h }, z4.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169764e // sdot za.s[x11, 6], { z18.h-z19.h }, z9.h\n"
+ ".inst 0xc161764f // sdot za.s[x11, 7], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xc163766c // sdot za.s[x11, 4], { z19.h-z20.h }, z3.h\n"
+ ".inst 0xc162766d // sdot za.s[x11, 5], { z19.h-z20.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16c768a // sdot za.s[x11, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc164768b // sdot za.s[x11, 3], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xa1422aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169766e // sdot za.s[x11, 6], { z19.h-z20.h }, z9.h\n"
+ ".inst 0xc161766f // sdot za.s[x11, 7], { z19.h-z20.h }, z1.h\n"
+ ".inst 0xc16c768c // sdot za.s[x11, 4], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc164768d // sdot za.s[x11, 5], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xa0422a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16d768e // sdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc16c768f // sdot za.s[x11, 7], { z20.h-z21.h }, z12.h\n"
"12:" // Unpadded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 22f\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z26.s }, p1/Z, [x16]\n"
+ "ld1b { z6.s }, p1/Z, [x16]\n"
"sub x25, x25, #0x1\n"
- "ld1b { z28.s }, p1/Z, [x20]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z26.h, z28.h\n"
"sub x15, x15, #0x1\n"
- "ld1b { z31.s }, p1/Z, [x20]\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"cmp x25, x15\n"
- "add z25.h, z25.h, z17.h\n"
- "ld1b { z15.s }, p1/Z, [x20]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z31.h, z15.h\n"
"csel x25, x25, x15, LT\n"
- "ld1b { z22.s }, p1/Z, [x20]\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z26.h, z26.h, z17.h\n"
+ "trn1 z24.h, z6.h, z13.h\n"
"add x16, x16, %x[ld_in_col]\n"
- "ld1b { z8.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z22.h, z8.h\n"
- "add z27.h, z27.h, z17.h\n"
- "ld1b { z21.s }, p1/Z, [x20]\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
"sub x15, x15, x25\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
- "trn1 z28.h, z21.h, z20.h\n"
- "add z28.h, z28.h, z17.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z21.h, z19.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "trn1 z26.h, z20.h, z13.h\n"
+ "add z24.h, z24.h, z15.h\n"
+ "trn1 z27.h, z22.h, z16.h\n"
+ "add z25.h, z25.h, z15.h\n"
+ "add z26.h, z26.h, z15.h\n"
+ "add z27.h, z27.h, z15.h\n"
"cbz x25, 21f\n"
"13:" // Unpadded: Main loop
"addvl x24, SP, #6\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
"addvl x23, SP, #12\n"
- "ld1b { z21.s }, p1/Z, [x16]\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "ld1b { z23.s }, p1/Z, [x16]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
"addvl x22, SP, #18\n"
"addvl x21, SP, #24\n"
- ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
"add x20, x16, %x[ld_in_row]\n"
- "ld1b { z0.s }, p1/Z, [x20]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
- ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"subs x25, x25, #0x1\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- "ld1b { z20.s }, p1/Z, [x20]\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
- ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
- "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc169770a // sdot za.s[x11, 2], { z24.h-z25.h }, z9.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
- "ld1b { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xc161770b // sdot za.s[x11, 3], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
"ld1b { z22.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
- ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
- "ld1b { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16c772a // sdot za.s[x11, 2], { z25.h-z26.h }, z12.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc164772b // sdot za.s[x11, 3], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
- ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
- "ld1b { z6.s }, p1/Z, [x20]\n"
- ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16d770e // sdot za.s[x11, 6], { z24.h-z25.h }, z13.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc165770f // sdot za.s[x11, 7], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
- ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
- "trn1 z25.h, z21.h, z0.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
- "add z25.h, z25.h, z17.h\n"
- ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
- "trn1 z26.h, z20.h, z31.h\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
- "add z26.h, z26.h, z17.h\n"
- ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
- "trn1 z27.h, z29.h, z22.h\n"
- "trn1 z28.h, z30.h, z6.h\n"
+ ".inst 0xc161774c // sdot za.s[x11, 4], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc160774d // sdot za.s[x11, 5], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa0422ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc167774e // sdot za.s[x11, 6], { z26.h-z27.h }, z7.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1422aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16c1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc1641709 // sdot za.s[x8, 1], { z24.h-z25.h }, z4.h\n"
+ "trn1 z24.h, z23.h, z19.h\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1651729 // sdot za.s[x8, 1], { z25.h-z26.h }, z5.h\n"
+ "trn1 z25.h, z21.h, z20.h\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "add z24.h, z24.h, z15.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ "trn1 z26.h, z22.h, z18.h\n"
+ "trn1 z27.h, z17.h, z16.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "add z27.h, z27.h, z17.h\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add z25.h, z25.h, z15.h\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "add z26.h, z26.h, z15.h\n"
+ "add z27.h, z27.h, z15.h\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "add z28.h, z28.h, z17.h\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 13b\n"
"b 21f\n"
@@ -667,513 +672,513 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z9.s }, p0/Z, [x16]\n"
- "add z9.h, p0/M, z9.h, z17.h\n"
"add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z9.h, z22.h\n"
- "trn1 z0.h, z21.h, z20.h\n"
+ "add z26.h, p0/M, z26.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x21]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z26.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16e76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z14.h\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xc16676c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z6.h\n"
+ "add z25.h, p0/M, z25.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1b { z1.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z25.h\n"
+ "add z1.h, p0/M, z1.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
- "trn1 z1.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
- ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- "trn1 z2.h, z21.h, z20.h\n"
- ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
- ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
- ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
- ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ ".inst 0xc16d76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z13.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ ".inst 0xc16c76e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ "trn1 z25.h, z1.h, z16.h\n"
+ ".inst 0xc1637708 // sdot za.s[x11, 0], { z24.h-z25.h }, z3.h\n"
+ ".inst 0xc1627709 // sdot za.s[x11, 1], { z24.h-z25.h }, z2.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z5.s }, p0/Z, [x16]\n"
- "add z5.h, p0/M, z5.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z28.h, z5.h, z22.h\n"
- "trn1 z29.h, z21.h, z20.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z1.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z14.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1667409 // sdot za.s[x11, 1], { z0.h-z1.h }, z6.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #18\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e740a // sdot za.s[x11, 2], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xc166740b // sdot za.s[x11, 3], { z0.h-z1.h }, z6.h\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z2.h, z18.h, z17.h\n"
+ "add z0.h, p0/M, z0.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- "trn1 z30.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #24\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
- ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- "trn1 z31.h, z21.h, z20.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
- ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
- ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
- ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
- ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
- ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
- ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
- ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
- ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16c7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z12.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc1647429 // sdot za.s[x11, 1], { z1.h-z2.h }, z4.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
+ ".inst 0xc16e742a // sdot za.s[x11, 2], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc166742b // sdot za.s[x11, 3], { z1.h-z2.h }, z6.h\n"
+ "trn1 z3.h, z0.h, z17.h\n"
+ ".inst 0xc16d7448 // sdot za.s[x11, 0], { z2.h-z3.h }, z13.h\n"
+ ".inst 0xc1657449 // sdot za.s[x11, 1], { z2.h-z3.h }, z5.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e744a // sdot za.s[x11, 2], { z2.h-z3.h }, z14.h\n"
+ ".inst 0xc166744b // sdot za.s[x11, 3], { z2.h-z3.h }, z6.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x16]\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z22.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z8.h, z29.h, z22.h\n"
- "trn1 z9.h, z21.h, z20.h\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16376c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z3.h\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc16276c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z2.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "addvl x22, SP, #12\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cb // sdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z17.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ ".inst 0xc16976cc // sdot za.s[x11, 4], { z22.h-z23.h }, z9.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
- "trn1 z10.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "addvl x21, SP, #18\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
- ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- "addvl x20, SP, #24\n"
- "trn1 z11.h, z21.h, z20.h\n"
- ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
- ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
- ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
- ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
- ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
- ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
- ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
- ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
- ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
- ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
- ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc16176cd // sdot za.s[x11, 5], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16c76e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z12.h\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xc16476e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0xc16e76ea // sdot za.s[x11, 2], { z23.h-z24.h }, z14.h\n"
+ ".inst 0xc16676eb // sdot za.s[x11, 3], { z23.h-z24.h }, z6.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ ".inst 0xc16976ec // sdot za.s[x11, 4], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176ed // sdot za.s[x11, 5], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16d7708 // sdot za.s[x11, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1657709 // sdot za.s[x11, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa0422aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc165770a // sdot za.s[x11, 2], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc164770b // sdot za.s[x11, 3], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x24, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z1.s }, p0/Z, [x16]\n"
- "add z1.h, p0/M, z1.h, z17.h\n"
- "add x20, x16, %x[ld_in_row]\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa0412ae4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z21.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z1.h, z22.h\n"
- "trn1 z27.h, z21.h, z20.h\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z22.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e76a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z14.h\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16676a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "addvl x23, SP, #6\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16776aa // sdot za.s[x11, 2], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc16676ab // sdot za.s[x11, 3], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z17.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z23.h, z18.h, z16.h\n"
+ ".inst 0xc16776ac // sdot za.s[x11, 4], { z21.h-z22.h }, z7.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
- "trn1 z28.h, z22.h, z20.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
- "addvl x22, SP, #12\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
- ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
- ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
- "addvl x21, SP, #18\n"
- "trn1 z29.h, z21.h, z20.h\n"
- ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
- "addvl x20, SP, #24\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
- ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
- ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
- ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc16676ad // sdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16576c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z5.h\n"
+ "ld1b { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc16476c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16d76ae // sdot za.s[x11, 6], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc16c76af // sdot za.s[x11, 7], { z21.h-z22.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0xc16e76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cb // sdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
- ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
- ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
- ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
- ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
- ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
- ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
- ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
- ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
- ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
- ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
- ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16e76cc // sdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc16676cd // sdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa0412a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976e8 // sdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176e9 // sdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16776ce // sdot za.s[x11, 6], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xc16676cf // sdot za.s[x11, 7], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc16176ea // sdot za.s[x11, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16076eb // sdot za.s[x11, 3], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16976ec // sdot za.s[x11, 4], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc16176ed // sdot za.s[x11, 5], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1422a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16c76ee // sdot za.s[x11, 6], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc16476ef // sdot za.s[x11, 7], { z23.h-z24.h }, z4.h\n"
"19:" // Padded: 0 priming loads
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
"cbz x25, 22f\n"
"mov x12, #0x0\n"
- ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z6.s }, p0/Z, [x16]\n"
- "add z6.h, p0/M, z6.h, z17.h\n"
"add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "ld1b { z18.s }, p0/Z, [x16]\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z30.s }, p0/Z, [x20]\n"
- "add z30.h, p0/M, z30.h, z17.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z17.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z17.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z6.h, z30.h\n"
- "trn1 z26.h, z27.h, z26.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z17.h\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z17.h, z16.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z9.s }, p0/Z, [x20]\n"
- "add z9.h, p0/M, z9.h, z17.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z21.s }, p0/Z, [x20]\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- "sub x25, x25, #0x1\n"
- "sub x15, x15, #0x1\n"
- "cmp x25, x15\n"
- "trn1 z27.h, z8.h, z9.h\n"
- "trn1 z28.h, z21.h, z29.h\n"
- "csel x25, x25, x15, LT\n"
- "add x16, x16, %x[ld_in_col]\n"
- "sub x15, x15, x25\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ "trn1 z27.h, z17.h, z16.h\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
"mov x12, #0x0\n"
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z8.s }, p0/Z, [x16]\n"
- "add z8.h, p0/M, z8.h, z17.h\n"
- "add x24, x16, %x[ld_in_row]\n"
- ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z21.s }, p0/Z, [x24]\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
- "addvl x23, SP, #6\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
- "addvl x22, SP, #12\n"
- "add z21.h, p0/M, z21.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
- ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24]\n"
+ "addvl x23, SP, #12\n"
+ "add x22, x16, %x[ld_in_row]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- "ld1b { z29.s }, p0/Z, [x24]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- "add z29.h, p0/M, z29.h, z17.h\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
- "mov x12, #0x4\n"
- "add x24, x24, %x[ld_in_row]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z16.s }, p0/Z, [x16]\n"
+ ".inst 0xc16d770a // sdot za.s[x11, 2], { z24.h-z25.h }, z13.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc165770b // sdot za.s[x11, 3], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa0402aa4 // ld1h { z4.h-z5.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc169772a // sdot za.s[x11, 2], { z25.h-z26.h }, z9.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc161772b // sdot za.s[x11, 3], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ "add z19.h, p0/M, z19.h, z15.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc165770e // sdot za.s[x11, 6], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc164770f // sdot za.s[x11, 7], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
- "ld1b { z30.s }, p0/Z, [x24]\n"
- "add z30.h, p0/M, z30.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ "add z23.h, p0/M, z23.h, z15.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16d772e // sdot za.s[x11, 6], { z25.h-z26.h }, z13.h\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc165772f // sdot za.s[x11, 7], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc167774c // sdot za.s[x11, 4], { z26.h-z27.h }, z7.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa0422aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "subs x25, x25, #0x1\n"
- ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
- "ld1b { z15.s }, p0/Z, [x24]\n"
- "add z15.h, p0/M, z15.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc163774e // sdot za.s[x11, 6], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774f // sdot za.s[x11, 7], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16c1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z12.h\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "add x16, x16, %x[ld_in_col]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
- "ld1b { z20.s }, p0/Z, [x24]\n"
- "add z20.h, p0/M, z20.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1641709 // sdot za.s[x8, 1], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402be1 // ld1h { z1.h, z9.h }, pn10.b/Z, [SP]\n"
+ "trn1 z24.h, z16.h, z19.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16d1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc1651729 // sdot za.s[x8, 1], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z25.h, z23.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z15.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
- "ld1b { z31.s }, p0/Z, [x24]\n"
- "add z31.h, p0/M, z31.h, z17.h\n"
- "add x24, x24, %x[ld_in_row]\n"
- ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
- "ld1b { z22.s }, p0/Z, [x24]\n"
- "add z22.h, p0/M, z22.h, z17.h\n"
- ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
- ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
- ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
- ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
- ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
- ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
- ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
- ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
- "trn1 z25.h, z8.h, z21.h\n"
- ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
- ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
- ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
- "trn1 z26.h, z29.h, z30.h\n"
- ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc1631748 // sdot za.s[x8, 0], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc1621749 // sdot za.s[x8, 1], { z26.h-z27.h }, z2.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
"add x8, x8, #0x2\n"
".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
- "trn1 z27.h, z15.h, z20.h\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- "trn1 z28.h, z31.h, z22.h\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "trn1 z26.h, z18.h, z16.h\n"
+ "add z17.h, p0/M, z17.h, z15.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add z18.h, p0/M, z18.h, z15.h\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ "trn1 z27.h, z17.h, z18.h\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 20b\n"
"21:" // Main loop tail
"addvl x23, SP, #6\n"
- ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc1697708 // sdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
"addvl x22, SP, #12\n"
- ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
- ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617709 // sdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #18\n"
"addvl x20, SP, #24\n"
- ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
- ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xc16e770a // sdot za.s[x11, 2], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770b // sdot za.s[x11, 3], { z24.h-z25.h }, z6.h\n"
".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16c7728 // sdot za.s[x11, 0], { z25.h-z26.h }, z12.h\n"
+ ".inst 0xc1647729 // sdot za.s[x11, 1], { z25.h-z26.h }, z4.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e770c // sdot za.s[x11, 4], { z24.h-z25.h }, z14.h\n"
+ ".inst 0xc166770d // sdot za.s[x11, 5], { z24.h-z25.h }, z6.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d772a // sdot za.s[x11, 2], { z25.h-z26.h }, z13.h\n"
+ ".inst 0xc165772b // sdot za.s[x11, 3], { z25.h-z26.h }, z5.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637748 // sdot za.s[x11, 0], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc1627749 // sdot za.s[x11, 1], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa0422ae2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16c770e // sdot za.s[x11, 6], { z24.h-z25.h }, z12.h\n"
+ ".inst 0xc164770f // sdot za.s[x11, 7], { z24.h-z25.h }, z4.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
- ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
- ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
- ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc163774a // sdot za.s[x11, 2], { z26.h-z27.h }, z3.h\n"
+ ".inst 0xc162774b // sdot za.s[x11, 3], { z26.h-z27.h }, z2.h\n"
+ ".inst 0xa1422ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
- ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
- ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
- ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
- ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
- ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
- ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
- ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
- ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
- ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
- ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
- ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
- ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
- ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16c774c // sdot za.s[x11, 4], { z26.h-z27.h }, z12.h\n"
+ ".inst 0xc164774d // sdot za.s[x11, 5], { z26.h-z27.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc161774e // sdot za.s[x11, 6], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc160774f // sdot za.s[x11, 7], { z26.h-z27.h }, z0.h\n"
".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
- ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
- ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
- ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
- ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
- ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
- ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xc16d1708 // sdot za.s[x8, 0], { z24.h-z25.h }, z13.h\n"
+ ".inst 0xc1651709 // sdot za.s[x8, 1], { z24.h-z25.h }, z5.h\n"
+ ".inst 0xc16e1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1661729 // sdot za.s[x8, 1], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xc1611748 // sdot za.s[x8, 0], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xc1601749 // sdot za.s[x8, 1], { z26.h-z27.h }, z0.h\n"
"add x8, x8, #0x2\n"
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ ".inst 0xc1abaa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1aaab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1becff4 // sclamp { z20.s-z23.s }, z31.s, z30.s\n"
+ "st1b { z20.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z22.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z21.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z23.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"22:" // Main loop skip tail
"cbz x15, 24f\n"
"23:" // Right padding loop
- ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066818 // mova { z24.d-z25.d }, za.d[x11, #0]\n"
"add x8, x8, #0x2\n"
"subs x15, x15, #0x1\n"
- ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
- ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc006683a // mova { z26.d-z27.d }, za.d[x11, #1]\n"
"add x11, x11, #0x2\n"
- ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
- ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
- ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
- ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
- ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
- "st1b { z8.s }, p1, [x14]\n"
+ ".inst 0xc0040b80 // mova za.d[x8, #0], { z28.d-z29.d }\n"
+ ".inst 0xc0040b81 // mova za.d[x8, #1], { z28.d-z29.d }\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ ".inst 0xc1abaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z11.s\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1becff8 // sclamp { z24.s-z27.s }, z31.s, z30.s\n"
+ "st1b { z24.s }, p1, [x14]\n"
"add x14, x14, x4\n"
- "st1b { z10.s }, p1, [x13]\n"
+ "st1b { z26.s }, p1, [x13]\n"
"add x13, x13, x10\n"
- "st1b { z9.s }, p1, [x9]\n"
+ "st1b { z25.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- "st1b { z11.s }, p1, [x28]\n"
+ "st1b { z27.s }, p1, [x28]\n"
"add x28, x28, x26\n"
"bgt 23b\n"
"24:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
"incw x20, ALL, MUL #16\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x17\n"
- "whilelt p1.s, x17, x7\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1192,6 +1197,8 @@ void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #30\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
index d8dc69127e..d4708f8916 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,194 +69,199 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
__asm__ __volatile__(
".inst 0xd503477f // SMSTART ZA\n"
- "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x22, SP\n"
+ "mov x21, #0xb\n"
"ptrue p2.b\n"
- "mov x20, #0xb\n"
- "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
- "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
- "sub x20, x20, x3\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "sub x20, x22, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "and x20, x20, #-0x400\n"
".inst 0x25207812 // ptrue pn10.b\n"
- "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
- "whilelt p1.s, XZR, x5\n"
- "whilelt p9.s, XZR, x20\n"
- "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "whilelt p8.s, XZR, x4\n"
+ "sub x21, x21, x4\n"
+ "mov SP, x20\n"
+ "ldr x7, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "str x22, [SP]\n"
"addvl SP, SP, #-15\n"
- "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
- "neg z7.h, p2/M, z7.h\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "whilelt p1.s, XZR, x6\n"
+ "whilelt p9.s, XZR, x21\n"
+ "neg z18.h, p2/M, z18.h\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "whilelt p8.s, XZR, x5\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
"eor p8.b, p2/Z, p8.b, p9.b\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
- "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
"1:" // Channel loop
"ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
- "mov z12.s, #0x0\n"
+ "mov z20.s, #0x0\n"
"cbz x20, 2f\n"
- "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x7, LSL #2]\n"
"2:" // Load bias: Done
- "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
- "mov x20, x22\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1rh { z0.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z12.h, #0x0\n"
+ "addvl x22, SP, #15\n"
+ "addvl x22, x22, #-3\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "mov z21.d, z20.d\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "sub z13.h, z13.h, z28.h\n"
- "incw x22\n"
- "mov z26.h, #0x0\n"
- "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z22.h, z22.h, z28.h\n"
- "trn1 z17.h, z13.h, z22.h\n"
- "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z20.h, z20.h, z28.h\n"
- "addvl x21, SP, #15\n"
- "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "sub z24.h, z24.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "sub z1.h, z1.h, z28.h\n"
- "trn1 z29.h, z20.h, z1.h\n"
+ "sub z30.h, z30.h, z0.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "incw x23\n"
+ "sub z8.h, z8.h, z0.h\n"
+ "sub z17.h, z17.h, z0.h\n"
+ "sub z26.h, z26.h, z0.h\n"
+ "trn1 z16.h, z24.h, z30.h\n"
"ld1sb { z27.s }, p2/Z, [x20]\n"
- "mov x20, x22\n"
- "sub z27.h, z27.h, z28.h\n"
- "incw x22\n"
- "ld1sb { z14.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "sub z14.h, z14.h, z28.h\n"
- "addvl x21, x21, #-3\n"
- "ld1sb { z18.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z18.h, z18.h, z28.h\n"
- "trn1 z22.h, z27.h, z26.h\n"
- "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z23.h, z23.h, z28.h\n"
- "st1h { z17.h }, p2, [x21]\n"
- "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "trn1 z15.h, z8.h, z17.h\n"
+ "ld1sb { z31.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z30.h, z30.h, z28.h\n"
- "trn1 z8.h, z14.h, z18.h\n"
- "ld1sb { z15.s }, p2/Z, [x20]\n"
- "mov x20, x22\n"
- "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
- "sub z15.h, z15.h, z28.h\n"
- "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z23.h, z23.h, z30.h\n"
- "sub z20.h, z20.h, z28.h\n"
- "ld1sb { z24.s }, p2/Z, [x20]\n"
- "incw x20, ALL, MUL #5\n"
- "sub z24.h, z24.h, z28.h\n"
- "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z26.h, z12.h\n"
+ "sub z11.h, z11.h, z0.h\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z16.h }, p2, [x22]\n"
+ "sub z31.h, z31.h, z0.h\n"
+ "incw x23\n"
+ "sub z9.h, z9.h, z0.h\n"
+ "st1h { z15.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z10.h, z10.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z11.h, z27.h, z11.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z0.h, z15.h, z26.h\n"
- "incw x22\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z16.h, z16.h, z28.h\n"
- "sub z13.h, z13.h, z28.h\n"
- "ld1sb { z11.s }, p2/Z, [x20]\n"
- "addvl x21, x21, #-3\n"
- "mov x20, x22\n"
- "st1h { z8.h }, p2, [x21]\n"
- "trn1 z27.h, z20.h, z24.h\n"
- "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "trn1 z13.h, z31.h, z9.h\n"
+ "ld1sb { z28.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z11.h, z11.h, z28.h\n"
- "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "sub z16.h, z16.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
- "trn1 z20.h, z16.h, z13.h\n"
- "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "trn1 z8.h, z10.h, z12.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z28.h, z28.h, z0.h\n"
+ "incw x23\n"
+ "sub z26.h, z26.h, z0.h\n"
+ "st1h { z13.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z14.h, z14.h, z0.h\n"
+ "st1h { z8.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z13.h, z16.h, z2.h\n"
+ "ld1sb { z31.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z3.h, z3.h, z28.h\n"
- "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
- "trn1 z29.h, z11.h, z26.h\n"
+ "trn1 z30.h, z28.h, z26.h\n"
"ld1sb { z16.s }, p2/Z, [x20]\n"
- "incw x22\n"
- "sub z13.h, z13.h, z28.h\n"
- "sub z15.h, z15.h, z28.h\n"
- "addvl x21, x21, #-3\n"
- "mov x20, x22\n"
- "st1h { z27.h }, p2, [x21]\n"
- "sub z16.h, z16.h, z28.h\n"
- "trn1 z19.h, z22.h, z3.h\n"
- "ld1sb { z17.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
- "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z31.h, z31.h, z0.h\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z31.h, z13.h, z15.h\n"
- "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
- "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "trn1 z17.h, z14.h, z12.h\n"
+ "sub z2.h, z2.h, z0.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "mov x20, x23\n"
+ "st1h { z13.h }, p2, [x22]\n"
+ "sub z16.h, z16.h, z0.h\n"
+ "sub z27.h, z27.h, z0.h\n"
+ "st1h { z30.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z4.h, z4.h, z0.h\n"
+ "st1h { z17.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z31.h, z31.h, z2.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "trn1 z16.h, z16.h, z26.h\n"
- "sub z17.h, z17.h, z28.h\n"
- "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
"incw x20, ALL, MUL #5\n"
- "sub z0.h, z0.h, z28.h\n"
- "sub z18.h, z18.h, z28.h\n"
- "ld1sb { z1.s }, p2/Z, [x20]\n"
- "sub z22.h, z22.h, z28.h\n"
- "sub z1.h, z1.h, z28.h\n"
- "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
- "addvl x21, x21, #-3\n"
- "st1h { z19.h }, p2, [x21]\n"
- "mov z13.d, z12.d\n"
- "mov z14.d, z12.d\n"
- "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
- "mov z15.d, z12.d\n"
- "trn1 z8.h, z17.h, z0.h\n"
- "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
- "addvl x21, x21, #-3\n"
- "trn1 z31.h, z18.h, z22.h\n"
- "trn1 z29.h, z1.h, z26.h\n"
- "st1h { z8.h }, p2, [x21]\n"
- "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
- "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
- "cbz x20, 3f\n"
- "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "trn1 z24.h, z16.h, z27.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z4.h, z4.h, z12.h\n"
+ "sub z29.h, z29.h, z0.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z10.h, z10.h, z0.h\n"
+ "st1h { z31.h }, p2, [x22]\n"
+ "sub z13.h, z13.h, z0.h\n"
+ "sub z8.h, z8.h, z0.h\n"
+ "st1h { z24.h }, p2, [x22, #1, MUL VL]\n"
+ "sub z11.h, z11.h, z0.h\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #-3\n"
+ "trn1 z14.h, z29.h, z10.h\n"
+ "trn1 z10.h, z13.h, z8.h\n"
+ "trn1 z4.h, z11.h, z12.h\n"
+ "st1h { z14.h }, p2, [x22]\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x22, #2, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z3.s }, p1/Z, [x21, x7, LSL #2]\n"
"3:" // Load mul: End
"ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
"cbz x20, 4f\n"
- "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x20, x7, LSL #2]\n"
"4:" // Load right_shift: End
- "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
- "sub x20, x7, #0x1\n"
- "orr x23, x20, %x[ld_in_col], LSL #16\n"
- "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
- "orr x23, x5, x23, LSL #22\n"
+ "ldr x17, [%x[args], %[offsetof_Args_input_cols]]\n"
"mov x22, #0xb\n"
- "add x21, x4, x3\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "add x20, x5, x4\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
"mov x8, #0x0\n"
- "lsl x23, x23, #0x0\n"
- "sub x22, x22, x21\n"
- "madd x20, x20, x4, x17\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "sub x22, x22, x20\n"
+ "sub x20, x17, #0x1\n"
+ "orr x20, x20, %x[ld_in_col], LSL #16\n"
+ "madd x21, x21, x5, x16\n"
+ "orr x20, x6, x20, LSL #22\n"
+ "lsl x20, x20, #0x0\n"
"5:" // Issue prefetches
"subs x22, x22, #0x1\n"
- ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
- "add x20, x20, %x[ld_in_col]\n"
+ ".inst 0xf8b44abc // rprfm pldstrm, x20, [x21]\n"
+ "add x21, x21, %x[ld_in_col]\n"
"bgt 5b\n"
"ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
- "lsl x20, %x[ld_in_row], #0x0\n"
- "msub x17, x4, x20, x17\n"
- ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
- "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
- ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "lsl x21, %x[ld_in_row], #0x0\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
"mov x22, #0x4\n"
- "ldp x15, x14, [x23], #0x10\n"
- ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
- "ldp x13, x11, [x20], #0x10\n"
- ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ "msub x16, x5, x21, x16\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
"ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "ldp x10, x9, [x23], #0x10\n"
- "ldp x28, x27, [x20], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0040e83 // mova za.d[x8, #3], { z20.d-z23.d }\n"
+ "ldp x11, x10, [x20], #0x10\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ "ldp x27, x26, [x20], #0x10\n"
"cbz x21, 7f\n"
"cmp x21, x22\n"
"csel x20, x21, x22, LT\n"
@@ -264,379 +269,379 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"sub x22, x22, x20\n"
"cbz x21, 7f\n"
".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
"and x22, x21, #0x1\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
"add x21, x21, #0x1\n"
"lsr x21, x21, #0x1\n"
- ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- "sub x16, x16, x21\n"
- ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
"6:" // Left padding
"subs x21, x21, #0x1\n"
- "st1b { z28.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z29.s }, p1, [x14]\n"
+ "st1b { z28.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z30.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z31.s }, p1, [x9]\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
"add x9, x9, x27\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"bgt 6b\n"
"7:" // Left padding: End
- "adds XZR, x4, x3\n"
+ "adds XZR, x5, x4\n"
"bne 14f\n"
"cbz x22, 12f\n"
"cmp x22, #0x1\n"
- "sub x7, x7, x22\n"
+ "sub x17, x17, x22\n"
"beq 11f\n"
"cmp x22, #0x2\n"
"beq 10f\n"
"cmp x22, #0x3\n"
"beq 9f\n"
"8:" // Unpadded: 4 priming loads
- "add x21, x17, %x[ld_in_row]\n"
- "ld1b { z27.s }, p1/Z, [x17]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x16]\n"
"addvl x20, SP, #12\n"
- "ld1b { z0.s }, p1/Z, [x21]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z0.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z9.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z11.s }, p1/Z, [x21]\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z28.h, z28.h, z11.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1b { z29.s }, p1/Z, [x21]\n"
+ "ld1b { z10.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z8.s }, p1/Z, [x21]\n"
+ "trn1 z8.h, z8.h, z26.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z8.h\n"
- "add z29.h, z29.h, z7.h\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z31.h\n"
"ld1b { z30.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z17.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1b { z31.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "ld1b { z26.s }, p1/Z, [x21]\n"
+ "trn1 z10.h, z10.h, z16.h\n"
+ "add z8.h, z8.h, z18.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z26.h\n"
- "add z31.h, z31.h, z7.h\n"
- ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
- "ld1b { z20.s }, p1/Z, [x21]\n"
- "mov z0.d, z20.d\n"
- "add z0.h, z0.h, z7.h\n"
- ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "trn1 z11.h, z11.h, z30.h\n"
+ "add z9.h, z9.h, z18.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add z10.h, z10.h, z18.h\n"
+ "trn1 z12.h, z12.h, z28.h\n"
+ "ld1h { z4.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "mov z13.d, z2.d\n"
+ "add z12.h, z12.h, z18.h\n"
+ ".inst 0xc1701508 // sdot za.s[x8, 0], { z8.h-z11.h }, z0.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xc1711528 // sdot za.s[x8, 0], { z9.h-z12.h }, z1.h\n"
+ ".inst 0xc1741548 // sdot za.s[x8, 0], { z10.h-z13.h }, z4.h\n"
"9:" // Unpadded: 3 priming loads
- "add x21, x17, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x17]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p1/Z, [x16]\n"
"addvl x20, SP, #9\n"
- "ld1b { z17.s }, p1/Z, [x21]\n"
- "add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z17.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z0.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1b { z31.s }, p1/Z, [x21]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1b { z0.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z0.h, z0.h, z16.h\n"
- "add z0.h, z0.h, z7.h\n"
- "ld1b { z1.s }, p1/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z2.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
"ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z1.h, z1.h, z16.h\n"
- "add z1.h, z1.h, z7.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
- "ld1b { z16.s }, p1/Z, [x21]\n"
- "mov z2.d, z16.d\n"
- "add z2.h, z2.h, z7.h\n"
- ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "trn1 z14.h, z14.h, z24.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add z14.h, z14.h, z18.h\n"
+ "trn1 z16.h, z16.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add z15.h, z15.h, z18.h\n"
+ "mov z17.d, z17.d\n"
+ "add z16.h, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ "add z17.h, z17.h, z18.h\n"
+ ".inst 0xc17b15a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z11.h\n"
+ ".inst 0xc17015c8 // sdot za.s[x8, 0], { z14.h-z17.h }, z0.h\n"
"10:" // Unpadded: 2 priming loads
- "add x22, x17, %x[ld_in_row]\n"
- "ld1b { z26.s }, p1/Z, [x17]\n"
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x16]\n"
"addvl x21, SP, #6\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z16.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #12\n"
- "ld1b { z16.s }, p1/Z, [x22]\n"
+ "ld1b { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z29.s }, p1/Z, [x22]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z28.h, z28.h, z29.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1b { z29.s }, p1/Z, [x22]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z19.s }, p1/Z, [x22]\n"
+ "ld1b { z14.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z19.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x22]\n"
+ "trn1 z12.h, z12.h, z26.h\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z23.s }, p1/Z, [x22]\n"
- "trn1 z30.h, z30.h, z23.h\n"
+ "ld1b { z15.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z30.h, z30.h, z7.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
- "ld1b { z22.s }, p1/Z, [x22]\n"
- "mov z31.d, z22.d\n"
- ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
- ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "trn1 z13.h, z13.h, z24.h\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add z13.h, z13.h, z18.h\n"
+ "trn1 z15.h, z15.h, z24.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z14.h, z14.h, z18.h\n"
+ "mov z16.d, z16.d\n"
+ "add z15.h, z15.h, z18.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "add z16.h, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17115a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z1.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ ".inst 0xc1781589 // sdot za.s[x8, 1], { z12.h-z15.h }, z8.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"11:" // Unpadded: 1 priming loads
- "add x22, x17, %x[ld_in_row]\n"
- "ld1b { z29.s }, p1/Z, [x17]\n"
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z9.s }, p1/Z, [x16]\n"
"addvl x21, SP, #3\n"
- "ld1b { z22.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z22.h\n"
- "add z29.h, z29.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x22]\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
"addvl x20, SP, #9\n"
- "ld1b { z25.s }, p1/Z, [x22]\n"
+ "ld1b { z10.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z30.h, z30.h, z25.h\n"
- "add z30.h, z30.h, z7.h\n"
- "ld1b { z31.s }, p1/Z, [x22]\n"
- "add x22, x22, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
+ "add x16, x16, %x[ld_in_col]\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- "add z31.h, z31.h, z7.h\n"
- "ld1b { z0.s }, p1/Z, [x22]\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z9.h, z9.h, z4.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z10.h, z10.h, z16.h\n"
"ld1b { z16.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "trn1 z0.h, z0.h, z16.h\n"
- "add z0.h, z0.h, z7.h\n"
- "ld1b { z1.s }, p1/Z, [x22]\n"
+ "ld1b { z13.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1b { z2.s }, p1/Z, [x22]\n"
- "trn1 z1.h, z1.h, z2.h\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "add z9.h, z9.h, z18.h\n"
+ "ld1b { z17.s }, p1/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "add z1.h, z1.h, z7.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
- "ld1b { z24.s }, p1/Z, [x22]\n"
- "mov z2.d, z24.d\n"
- ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
- ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
- "add z2.h, z2.h, z7.h\n"
- "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
- ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
- "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z10.h, z10.h, z18.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "trn1 z13.h, z13.h, z17.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z12.h, z12.h, z18.h\n"
+ "mov z14.d, z16.d\n"
+ "add z13.h, z13.h, z18.h\n"
+ ".inst 0xc1701528 // sdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add z14.h, z14.h, z18.h\n"
+ ".inst 0xc1711548 // sdot za.s[x8, 0], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701529 // sdot za.s[x8, 1], { z9.h-z12.h }, z0.h\n"
+ ".inst 0xc1741568 // sdot za.s[x8, 0], { z11.h-z14.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1711549 // sdot za.s[x8, 1], { z10.h-z13.h }, z1.h\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
"12:" // Unpadded: 0 priming loads
- "cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "cmp x17, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
- "add x21, x17, %x[ld_in_row]\n"
- "ld1b { z23.s }, p1/Z, [x17]\n"
- "sub x7, x7, #0x2\n"
- "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z11.s }, p1/Z, [x16]\n"
+ "sub x17, x17, #0x2\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z25.h\n"
- "sub x16, x16, #0x1\n"
- "ld1b { z24.s }, p1/Z, [x21]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "lsr x20, x7, #0x1\n"
- "add z23.h, z23.h, z7.h\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "lsr x20, x17, #0x1\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z30.h\n"
- "cmp x20, x16\n"
- "ld1b { z25.s }, p1/Z, [x21]\n"
+ "cmp x20, x15\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "csel x26, x20, x16, LT\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ "csel x25, x20, x15, LT\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z22.h\n"
- "add z25.h, z25.h, z7.h\n"
- "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z14.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z22.s }, p1/Z, [x21]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "and x17, x17, #0x1\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z22.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x21]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "and x7, x7, #0x1\n"
- "ld1b { z30.s }, p1/Z, [x21]\n"
+ "trn1 z13.h, z13.h, z4.h\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z30.h\n"
- "add z27.h, z27.h, z7.h\n"
- "ld1b { z28.s }, p1/Z, [x21]\n"
- "mov z28.d, z28.d\n"
- "add z28.h, z28.h, z7.h\n"
- "sub x16, x16, x26\n"
- "cbz x26, 21f\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ "mov z16.d, z30.d\n"
+ "add z14.h, z14.h, z18.h\n"
+ "add z15.h, z15.h, z18.h\n"
+ "add z16.h, z16.h, z18.h\n"
+ "cbz x25, 21f\n"
"13:" // Unpadded: Main loop
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
- "addvl x25, SP, #6\n"
- "addvl x24, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
- "add x23, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "add x23, x16, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
"addvl x21, SP, #9\n"
- "subs x26, x26, #0x1\n"
- ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
- ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
- ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
- "ld1b { z23.s }, p1/Z, [x17]\n"
- "add x17, x17, %x[ld_in_col]\n"
- "add x20, x17, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
- ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
- "ld1b { z16.s }, p1/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ "ld1b { z28.s }, p1/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z16.h\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
- "add z23.h, z23.h, z7.h\n"
- "ld1b { z24.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a3ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z3.s\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ "ld1b { z29.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
- "ld1b { z18.s }, p1/Z, [x23]\n"
+ "ld1b { z9.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z18.h\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1b { z25.s }, p1/Z, [x23]\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "ld1b { z8.s }, p1/Z, [x23]\n"
+ "trn1 z29.h, z29.h, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z8.h\n"
- "add z25.h, z25.h, z7.h\n"
- "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add z28.h, z28.h, z18.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "ld1b { z28.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a7ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1b { z13.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z28.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x23]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ "ld1b { z0.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
- "ld1b { z28.s }, p1/Z, [x23]\n"
- "trn1 z27.h, z27.h, z28.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z29.h, z29.h, z18.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
"add x23, x23, %x[ld_in_row]\n"
- "add z27.h, z27.h, z7.h\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- "ld1b { z20.s }, p1/Z, [x23]\n"
- "mov z28.d, z20.d\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "add z28.h, z28.h, z7.h\n"
- "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
- ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "ld1b { z23.s }, p1/Z, [x17]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "st1b { z17.s }, p1, [x14]\n"
+ "trn1 z31.h, z31.h, z13.h\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ ".inst 0xc1a6ce78 // sclamp { z24.s-z27.s }, z19.s, z6.s\n"
+ "ld1h { z12.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z30.h, z30.h, z18.h\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "mov z1.d, z8.d\n"
+ "add z31.h, z31.h, z18.h\n"
+ "st1b { z24.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "st1b { z25.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "add z0.h, z0.h, z18.h\n"
+ "st1b { z26.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z1.h, z1.h, z18.h\n"
+ "st1b { z27.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ ".inst 0xc17a1788 // sdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xc17b17a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1781789 // sdot za.s[x8, 1], { z28.h-z31.h }, z8.h\n"
+ "ld1b { z11.s }, p1/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17c17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z12.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z16.h\n"
- "st1b { z18.s }, p1, [x10]\n"
- "ld1b { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xc17917a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add x10, x10, x28\n"
- "st1b { z19.s }, p1, [x9]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z16.h\n"
- "add x9, x9, x27\n"
- "ld1b { z25.s }, p1/Z, [x20]\n"
+ "trn1 z11.h, z11.h, z17.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc17417c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z4.h\n"
+ "ld1b { z13.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "add z23.h, z23.h, z7.h\n"
+ "trn1 z12.h, z12.h, z9.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z16.h\n"
- "add z24.h, z24.h, z7.h\n"
- "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add z11.h, z11.h, z18.h\n"
+ "ld1b { z14.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "add z25.h, z25.h, z7.h\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z16.h\n"
- "add z26.h, z26.h, z7.h\n"
- "ld1b { z27.s }, p1/Z, [x20]\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "ld1b { z16.s }, p1/Z, [x20]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z12.h, z12.h, z18.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "add z27.h, z27.h, z7.h\n"
"ld1b { z16.s }, p1/Z, [x20]\n"
- "mov z28.d, z16.d\n"
- "add z28.h, z28.h, z7.h\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "trn1 z14.h, z14.h, z1.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z13.h, z13.h, z18.h\n"
+ "mov z16.d, z16.d\n"
+ "add z14.h, z14.h, z18.h\n"
+ "add z15.h, z15.h, z18.h\n"
+ "add z16.h, z16.h, z18.h\n"
"bgt 13b\n"
"b 21f\n"
"14:" // Padded
"cbz x22, 19f\n"
"cmp x22, #0x1\n"
- "sub x7, x7, x22\n"
+ "sub x17, x17, x22\n"
"beq 18f\n"
"cmp x22, #0x2\n"
"beq 17f\n"
@@ -644,686 +649,686 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"beq 16f\n"
"15:" // Padded: 4 priming loads
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x17]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x21, x17, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z28.s }, p0/Z, [x21]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z27.h, z27.h, z17.h\n"
- "trn1 z28.h, z28.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x21]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z30.s }, p0/Z, [x21]\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z31.s }, p0/Z, [x21]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x20, SP, #12\n"
+ ".inst 0xc1711568 // sdot za.s[x8, 0], { z11.h-z14.h }, z1.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z29.h, z29.h, z18.h\n"
- "trn1 z30.h, z30.h, z17.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
- "trn1 z31.h, z31.h, z16.h\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
- "ld1b { z20.s }, p0/Z, [x21]\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
- "mov z0.d, z20.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
- "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"16:" // Padded: 3 priming loads
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x17]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x21, x17, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x21]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z17.h\n"
- "trn1 z25.h, z25.h, z16.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z26.s }, p0/Z, [x21]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x21]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z27.s }, p0/Z, [x21]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x21]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z28.s }, p0/Z, [x21]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x21]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x20, SP, #9\n"
+ ".inst 0xc1711568 // sdot za.s[x8, 0], { z11.h-z14.h }, z1.h\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
"add x21, x21, %x[ld_in_row]\n"
- "trn1 z26.h, z26.h, z18.h\n"
- "trn1 z27.h, z27.h, z17.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- "trn1 z28.h, z28.h, z16.h\n"
- ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
- "ld1b { z11.s }, p0/Z, [x21]\n"
- "add z11.h, p0/M, z11.h, z7.h\n"
- "mov z29.d, z11.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
- "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc17015a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z0.h\n"
"17:" // Padded: 2 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z25.s }, p0/Z, [x17]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #12\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z17.h\n"
- "trn1 z26.h, z26.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x21, SP, #6\n"
- "trn1 z27.h, z27.h, z18.h\n"
- "trn1 z28.h, z28.h, z17.h\n"
- ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z29.h, z29.h, z16.h\n"
- ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #12\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
- "mov z30.d, z1.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"18:" // Padded: 1 priming loads
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z25.s }, p0/Z, [x17]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #9\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z25.h, z25.h, z17.h\n"
- "trn1 z26.h, z26.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "addvl x21, SP, #3\n"
- "trn1 z27.h, z27.h, z18.h\n"
- "trn1 z28.h, z28.h, z17.h\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z29.h, z29.h, z16.h\n"
- ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
- "ld1b { z0.s }, p0/Z, [x20]\n"
- "addvl x20, SP, #9\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
- ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
- ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
- "mov z30.d, z0.d\n"
- "add x17, x17, %x[ld_in_col]\n"
- "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
- ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
"ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
"19:" // Padded: 0 priming loads
- "cmp x7, #0x2\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "cmp x17, #0x2\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"blt 22f\n"
"mov x12, #0x0\n"
+ "add x21, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "sub x17, x17, #0x2\n"
+ "sub x15, x15, #0x1\n"
+ "lsr x20, x17, #0x1\n"
+ "cmp x20, x15\n"
+ "and x17, x17, #0x1\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "csel x25, x20, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z23.h, z23.h, z17.h\n"
- "trn1 z24.h, z24.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z19.s }, p0/Z, [x20]\n"
- "add z19.h, p0/M, z19.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z18.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z18.h, p0/M, z18.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "sub x7, x7, #0x2\n"
- "sub x16, x16, #0x1\n"
- "trn1 z25.h, z25.h, z19.h\n"
- "trn1 z26.h, z26.h, z18.h\n"
- "lsr x20, x7, #0x1\n"
- "cmp x20, x16\n"
- "trn1 z27.h, z27.h, z17.h\n"
- "mov z28.d, z16.d\n"
- "csel x25, x20, x16, LT\n"
- "add x17, x17, %x[ld_in_col]\n"
- "and x7, x7, #0x1\n"
- "sub x16, x16, x25\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ "mov z16.d, z16.d\n"
"cbz x25, 21f\n"
"20:" // Padded: Main loop
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"addvl x24, SP, #6\n"
- "addvl x23, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x20, SP, #12\n"
"mov x12, #0x0\n"
+ "add x23, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
- "add x20, x17, %x[ld_in_row]\n"
"addvl x22, SP, #3\n"
- ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
- ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x21, SP, #9\n"
"subs x25, x25, #0x1\n"
- ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
- "ld1b { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc178156a // sdot za.s[x8, 2], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z25.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add z25.h, p0/M, z25.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc179158a // sdot za.s[x8, 2], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa1402ac4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x22]\n"
+ "ld1b { z10.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc17115aa // sdot za.s[x8, 2], { z13.h-z16.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z10.h, p0/M, z10.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ "ld1b { z26.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z10.h\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
+ "add z26.h, p0/M, z26.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
- "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "st1b { z28.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
- "trn1 z23.h, z23.h, z16.h\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "ld1b { z27.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z27.h, p0/M, z27.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z3.s }, p0/Z, [x20]\n"
- "add z3.h, p0/M, z3.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z28.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z28.h, p0/M, z28.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z30.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ "add z29.h, p0/M, z29.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z29.s }, p0/Z, [x20]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
- "trn1 z24.h, z24.h, z1.h\n"
- "trn1 z25.h, z25.h, z3.h\n"
- "trn1 z26.h, z26.h, z30.h\n"
- ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
- "add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
- "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0xc1741728 // sdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1b { z15.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x0\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
- "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p0/Z, [x23]\n"
+ "trn1 z29.h, z29.h, z15.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
- "ld1b { z23.s }, p0/Z, [x17]\n"
- "add z23.h, p0/M, z23.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc17c1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z12.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "mov z30.d, z16.d\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z8.s }, p0/Z, [x20]\n"
- "add z8.h, p0/M, z8.h, z7.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1791749 // sdot za.s[x8, 1], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
- "ld1b { z24.s }, p0/Z, [x20]\n"
- "mov z28.d, z20.d\n"
- "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z22.s }, p0/Z, [x20]\n"
- ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
"mov x12, #0x4\n"
- "add z22.h, p0/M, z22.h, z7.h\n"
- "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1b { z13.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z20.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z31.s }, p0/Z, [x20]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
"add x20, x20, %x[ld_in_row]\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z1.s }, p0/Z, [x20]\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z17.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- "trn1 z23.h, z23.h, z8.h\n"
- "trn1 z24.h, z24.h, z22.h\n"
- "st1b { z18.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "trn1 z25.h, z25.h, z28.h\n"
- "trn1 z26.h, z26.h, z20.h\n"
- "st1b { z19.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
- "trn1 z27.h, z27.h, z31.h\n"
- "mov z28.d, z1.d\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ "mov z16.d, z16.d\n"
"bgt 20b\n"
"21:" // Main loop tail
- ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
"addvl x24, SP, #6\n"
"addvl x23, SP, #12\n"
- ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
- ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
- "add x22, x17, %x[ld_in_row]\n"
"addvl x21, SP, #3\n"
- ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
- ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
"addvl x20, SP, #9\n"
- ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
- "ld1b { z29.s }, p0/Z, [x17]\n"
- "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc1711569 // sdot za.s[x8, 1], { z11.h-z14.h }, z1.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc1791589 // sdot za.s[x8, 1], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc1a3ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z3.s\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z8.s }, p0/Z, [x22]\n"
- "add z8.h, p0/M, z8.h, z7.h\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ "ld1b { z10.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a5aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
+ "add x8, x8, #0x1\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add z10.h, p0/M, z10.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
- "ld1b { z30.s }, p0/Z, [x22]\n"
- "add z30.h, p0/M, z30.h, z7.h\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
- "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "trn1 z11.h, z11.h, z10.h\n"
+ ".inst 0xc1a6ce78 // sclamp { z24.s-z27.s }, z19.s, z6.s\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
"mov x12, #0x4\n"
- "ld1b { z20.s }, p0/Z, [x22]\n"
- ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
- "add z20.h, p0/M, z20.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z25.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z26.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
- "trn1 z29.h, z29.h, z8.h\n"
- "ld1b { z31.s }, p0/Z, [x22]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
+ "st1b { z27.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z25.s }, p0/Z, [x22]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z0.s }, p0/Z, [x22]\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z17.s }, p0/Z, [x22]\n"
"mov x12, #0x8\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z1.s }, p0/Z, [x22]\n"
- "add z1.h, p0/M, z1.h, z7.h\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z28.s }, p0/Z, [x22]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "trn1 z30.h, z30.h, z20.h\n"
- "trn1 z31.h, z31.h, z25.h\n"
- "trn1 z0.h, z0.h, z17.h\n"
- ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1781568 // sdot za.s[x8, 0], { z11.h-z14.h }, z8.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
"add x22, x22, %x[ld_in_row]\n"
- ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
- "add x8, x8, #0x1\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z1.h, z1.h, z28.h\n"
- ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
- "ld1b { z22.s }, p0/Z, [x22]\n"
- ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
- "add z22.h, p0/M, z22.h, z7.h\n"
- ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
- ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
- "add x17, x17, %x[ld_in_col]\n"
- ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
- ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
- "mov z2.d, z22.d\n"
- "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
- ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
- ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
- "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
- ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
- "st1b { z24.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z25.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
- "st1b { z26.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
- "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
- "st1b { z27.s }, p1, [x9]\n"
- "add x9, x9, x27\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc1791588 // sdot za.s[x8, 0], { z12.h-z15.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1701569 // sdot za.s[x8, 1], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1711589 // sdot za.s[x8, 1], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xa1402be2 // ld1h { z2.h, z10.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc17015a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z0.h\n"
+ "ld1h { z8.h }, p2/Z, [SP, #2, MUL VL]\n"
"22:" // Main loop skip tail
- "cbz x7, 23f\n" // Skip remainder inputs
+ "cbz x17, 23f\n" // Skip remainder inputs
"mov x12, #0x0\n"
+ "add x22, x16, %x[ld_in_row]\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z24.s }, p0/Z, [x17]\n"
- "add z24.h, p0/M, z24.h, z7.h\n"
- "add x20, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #6\n"
+ "addvl x20, SP, #12\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z11.s }, p0/Z, [x16]\n"
+ "add z11.h, p0/M, z11.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z25.s }, p0/Z, [x20]\n"
- "add z25.h, p0/M, z25.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z11.h, z11.h, z16.h\n"
+ "add z12.h, p0/M, z12.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
"mov x12, #0x4\n"
- "add x20, x20, %x[ld_in_row]\n"
- "trn1 z24.h, z24.h, z17.h\n"
- "trn1 z25.h, z25.h, z16.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z26.s }, p0/Z, [x20]\n"
- "add z26.h, p0/M, z26.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z12.h, z12.h, z16.h\n"
+ "add z13.h, p0/M, z13.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z17.s }, p0/Z, [x20]\n"
- "add z17.h, p0/M, z17.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "ld1b { z27.s }, p0/Z, [x20]\n"
- "add z27.h, p0/M, z27.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z13.h, z13.h, z16.h\n"
+ "add z14.h, p0/M, z14.h, z18.h\n"
".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
- "ld1b { z16.s }, p0/Z, [x20]\n"
"mov x12, #0x8\n"
- "add z16.h, p0/M, z16.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
- "ld1b { z28.s }, p0/Z, [x20]\n"
- "add z28.h, p0/M, z28.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z14.h, z14.h, z16.h\n"
+ "add z15.h, p0/M, z15.h, z18.h\n"
".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
- "ld1b { z31.s }, p0/Z, [x20]\n"
- "add z31.h, p0/M, z31.h, z7.h\n"
- "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721568 // sdot za.s[x8, 0], { z11.h-z14.h }, z2.h\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z17.h, p0/M, z17.h, z18.h\n"
".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
- "trn1 z26.h, z26.h, z17.h\n"
- "trn1 z27.h, z27.h, z16.h\n"
- "ld1b { z0.s }, p0/Z, [x20]\n"
- "add z0.h, p0/M, z0.h, z7.h\n"
- "trn1 z28.h, z28.h, z31.h\n"
- "addvl x21, SP, #6\n"
- ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
- "mov z29.d, z0.d\n"
- "addvl x20, SP, #12\n"
- "sub x16, x16, #0x1\n"
- ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
- ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
- ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ "trn1 z15.h, z15.h, z17.h\n"
+ "add z16.h, p0/M, z16.h, z18.h\n"
+ ".inst 0xc17a1588 // sdot za.s[x8, 0], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "mov z16.d, z16.d\n"
+ ".inst 0xc1721569 // sdot za.s[x8, 1], { z11.h-z14.h }, z2.h\n"
+ ".inst 0xc17815a8 // sdot za.s[x8, 0], { z13.h-z16.h }, z8.h\n"
"ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
- ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
- ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
- ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
- ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
- ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
- ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
- ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
- ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
- ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
- "st1b { z16.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
- "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
- "st1b { z17.s }, p1, [x14]\n"
- "add x14, x14, x11\n"
- ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc17a1589 // sdot za.s[x8, 1], { z12.h-z15.h }, z10.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc170156a // sdot za.s[x8, 2], { z11.h-z14.h }, z0.h\n"
+ ".inst 0xc17215a9 // sdot za.s[x8, 1], { z13.h-z16.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ ".inst 0xc171158a // sdot za.s[x8, 2], { z12.h-z15.h }, z1.h\n"
+ ".inst 0xc1a5aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc17015aa // sdot za.s[x8, 2], { z13.h-z16.h }, z0.h\n"
"add x8, x8, #0x1\n"
- "st1b { z18.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z19.s }, p1, [x9]\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a7ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ce7c // sclamp { z28.s-z31.s }, z19.s, z6.s\n"
+ "st1b { z28.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z29.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z30.s }, p1, [x9]\n"
"add x9, x9, x27\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "st1b { z31.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"23:" // Tail input: End
- "cbz x16, 25f\n"
+ "cbz x15, 25f\n"
"24:" // Right padding loop
- ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
- ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
"add x8, x8, #0x1\n"
- ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
- "subs x16, x16, #0x1\n"
- ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
- ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
- ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
- "st1b { z28.s }, p1, [x15]\n"
- "add x15, x15, x13\n"
- "st1b { z29.s }, p1, [x14]\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc0040e84 // mova za.d[x8, #4], { z20.d-z23.d }\n"
+ ".inst 0xc1a3ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z3.s\n"
+ ".inst 0xc1a5aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a7ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ ".inst 0xc1a6ce68 // sclamp { z8.s-z11.s }, z19.s, z6.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
"add x14, x14, x11\n"
- "st1b { z30.s }, p1, [x10]\n"
- "add x10, x10, x28\n"
- "st1b { z31.s }, p1, [x9]\n"
+ "st1b { z9.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z10.s }, p1, [x9]\n"
"add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
"bgt 24b\n"
"25:" // End
"ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x7\n"
+ "whilelt p1.s, x7, x6\n"
"incw x20, ALL, MUL #16\n"
"incw x20, ALL, MUL #9\n"
"str x20, [%x[args], %[offsetof_Args_weights]]\n"
"ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
- "incw x6\n"
- "whilelt p1.s, x6, x5\n"
"ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
"add x20, x20, x21\n"
"str x20, [%x[args], %[offsetof_Args_inptr]]\n"
@@ -1342,9 +1347,11 @@ void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
"b.any 1b\n"
"addvl SP, SP, #15\n"
".inst 0xd503467f // SMSTOP\n"
+ "ldr x20, [SP, #0x0]\n"
+ "mov SP, x20\n"
:
: [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
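(Note: the quantized store path in the hunk above — the sqdmulh/srshl/add/sclamp sequence before each st1b — corresponds to the arm_gemm::Requantize32 fields named in the operand list. The following is only a rough scalar model of that sequence, not the library's code; the parameter names are illustrative, and the shift is written as a non-negative count even though the kernel itself hands SRSHL a negative shift amount.)

    #include <algorithm>
    #include <cstdint>

    // Scalar sketch of the SME2 store epilogue: SQDMULH by the per-layer
    // multiplier, rounding right shift, add the output offset, clamp, then
    // narrow for the st1b store.
    static inline uint8_t requantize_lane(int32_t acc,
                                          int32_t per_layer_mul,
                                          int32_t right_shift,   // assumed >= 0
                                          int32_t c_offset,
                                          int32_t minval,
                                          int32_t maxval)
    {
        // SQDMULH on 32-bit lanes: high half of the doubled product, saturated.
        int64_t doubled_high = ((int64_t)acc * (int64_t)per_layer_mul) >> 31;
        int32_t high = (int32_t)std::min<int64_t>(doubled_high, INT32_MAX);

        // SRSHL with a negative shift behaves as a rounding arithmetic shift right.
        if (right_shift > 0)
        {
            high = (high + (1 << (right_shift - 1))) >> right_shift;
        }

        // Add the output zero point and clamp to [minval, maxval] (the sclamp step).
        return (uint8_t)std::clamp(high + c_offset, minval, maxval);
    }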
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index d807856ccb..dec7a99425 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,84 +88,84 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x10, #0x0\n"
- "mov x14, #0x0\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
"1:" // Tile loop
- "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
"mov x25, #0x2\n"
- "mov x24, #0x2\n"
- "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
- "cnth x11\n"
- "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
- "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "cnth x15\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
- "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "ld1h { z27.h }, p3/Z, [x10]\n"
- "add x27, x13, x13\n"
- "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
- "add x26, x9, x23, LSL #1\n"
- "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
- "add x25, x26, x23, LSL #1\n"
- "add x24, x27, x13\n"
- "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "cmp x11, %x[n_channels]\n"
- "add x23, x25, x23, LSL #1\n"
- "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "add x22, x28, x22, LSL #1\n"
- "mov x21, #0x0\n"
- "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
- "sub x20, XZR, x11\n"
- "ld1h { z10.h }, p2/Z, [x9]\n"
- "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
- "addvl x10, x10, #-6\n"
- "ld1h { z12.h }, p2/Z, [x26, x27, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "mov x12, #0x0\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x22, x17, x24\n" // offset = tile_i * ld_input_row
+ "mul x21, x17, x23\n" // offset = tile_i * ld_output_row
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1rh { z27.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x28, x14, x14\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x20, XZR, x15\n"
+ "madd x22, x16, x14, x22\n" // offset += tile_j * ld_input_col
+ "ld1h { z25.h }, p3/Z, [x11]\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "add x27, x28, x14\n"
+ "madd x21, x16, x13, x21\n" // offset += tile_j * ld_output_col
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x10, x10, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x26, x10, x24, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x10]\n"
+ "ld1h { z11.h }, p2/Z, [x10, x27, LSL #1]\n"
+ "add x25, x26, x24, LSL #1\n"
+ "add x9, x9, x21, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x25, x24, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x26, x28, LSL #1]\n"
+ "add x23, x9, x23, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1h { z13.h }, p2/Z, [x25, x14, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
- "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
- "whilelt p1.h, x11, %x[n_channels]\n"
- "inch x21\n"
- "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z18.h }, p2/Z, [x23]\n"
- "inch x11\n"
+ "movprfx z24, z25\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z25\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x15, %x[n_channels]\n"
+ "inch x12\n"
+ "movprfx z22, z25\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z25\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x24]\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z25.h }, p3/Z, [x11]\n"
+ "inch x20\n"
"fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x28, LSL #1]\n"
"fmla z23.h, p3/M, z2.h, z11.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
"fmla z22.h, p3/M, z2.h, z12.h\n"
"fmla z21.h, p3/M, z1.h, z12.h\n"
- "mov p0.b, p2.b\n"
- "ld1h { z27.h }, p3/Z, [x10]\n"
"fmla z24.h, p3/M, z5.h, z12.h\n"
"fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
- "inch x20\n"
+ "ld1h { z16.h }, p2/Z, [x10, x14, LSL #1]\n"
"fmla z22.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x28, LSL #1]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.h, p3/M, z3.h, z13.h\n"
- "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
- "addvl x9, x9, #1\n"
"fmla z24.h, p3/M, z7.h, z13.h\n"
"fmla z23.h, p3/M, z6.h, z13.h\n"
"fmla z22.h, p3/M, z4.h, z13.h\n"
@@ -173,102 +173,102 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1h { z17.h }, p2/Z, [x26]\n"
"fmla z24.h, p3/M, z1.h, z16.h\n"
"fmla z23.h, p3/M, z0.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x26, x27, LSL #1]\n"
"addvl x26, x26, #1\n"
"fmla z22.h, p3/M, z5.h, z20.h\n"
"fmla z21.h, p3/M, z4.h, z20.h\n"
- "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
"fmla z24.h, p3/M, z2.h, z18.h\n"
"fmla z23.h, p3/M, z1.h, z18.h\n"
"ld1h { z19.h }, p2/Z, [x25]\n"
- "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
"fmla z22.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
"fmla z21.h, p3/M, z2.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
"fmla z24.h, p3/M, z8.h, z20.h\n"
"fmla z23.h, p3/M, z7.h, z20.h\n"
- "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x25, x27, LSL #1]\n"
"addvl x25, x25, #1\n"
"fmla z22.h, p3/M, z3.h, z19.h\n"
"fmla z21.h, p3/M, z5.h, z18.h\n"
- "ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
"fmla z24.h, p3/M, z3.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1h { z13.h }, p1/Z, [x25, x14, LSL #1]\n"
"fmla z23.h, p3/M, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x28, LSL #1]\n"
+ "whilelt p2.h, x12, %x[n_channels]\n"
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "addvl x24, x24, #1\n"
"fmla z22.h, p3/M, z7.h, z17.h\n"
"fmla z21.h, p3/M, z6.h, z17.h\n"
- "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x10, x27, LSL #1]\n"
"fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "ld1h { z9.h }, p1/Z, [x26, x14, LSL #1]\n"
"fmla z23.h, p3/M, z8.h, z18.h\n"
- "fmax z24.h, p3/M, z24.h, z26.h\n"
- "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "ld1h { z10.h }, p1/Z, [x10]\n"
"fmla z22.h, p3/M, z8.h, z16.h\n"
"fmla z21.h, p3/M, z7.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z26.h\n"
- "fmax z21.h, p3/M, z21.h, z26.h\n"
- "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
- "cmp x11, %x[n_channels]\n"
- "fmin z24.h, p3/M, z24.h, z25.h\n"
- "ld1h { z10.h }, p1/Z, [x9]\n"
- "ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z25.h\n"
- "fmin z22.h, p3/M, z22.h, z25.h\n"
- "ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
- "st1h { z24.h }, p0, [x28]\n"
- "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "ld1h { z12.h }, p1/Z, [x26, x28, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z27.h\n"
+ "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "fmax z23.h, p3/M, z23.h, z27.h\n"
+ "fmin z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z27.h\n"
+ "fmax z21.h, p3/M, z21.h, z27.h\n"
+ "fmin z23.h, p3/M, z23.h, z26.h\n"
+ "fmin z22.h, p3/M, z22.h, z26.h\n"
+ "st1h { z24.h }, p0, [x9]\n"
+ "fmin z21.h, p3/M, z21.h, z26.h\n"
+ "st1h { z23.h }, p0, [x9, x13, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z21.h }, p0, [x23, x13, LSL #1]\n"
"addvl x23, x23, #1\n"
- "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "st1h { z22.h }, p0, [x22]\n"
- "addvl x28, x28, #1\n"
- "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "addvl x10, x10, #-6\n"
- "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
- "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z18.h }, p2/Z, [x23]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z24, z25\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z25\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z25\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z25\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x24]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "mov p0.b, p2.b\n"
+ "add x16, x16, #0x1\n"
+ "add x20, x17, #0x1\n"
"fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x28, LSL #1]\n"
"fmla z23.h, p3/M, z2.h, z11.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "cmp x16, x22\n"
"fmla z22.h, p3/M, z2.h, z12.h\n"
"fmla z21.h, p3/M, z1.h, z12.h\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x20\n"
+ "csel x17, x17, x20, LT\n"
+ "csel x16, x16, XZR, LT\n"
"fmla z24.h, p3/M, z5.h, z12.h\n"
"fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
- "add x21, x10, #0x1\n"
+ "ld1h { z16.h }, p2/Z, [x10, x14, LSL #1]\n"
"fmla z22.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x10, x28, LSL #1]\n"
"fmla z21.h, p3/M, z3.h, z13.h\n"
- "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "cmp x17, x21\n"
"fmla z24.h, p3/M, z7.h, z13.h\n"
"fmla z23.h, p3/M, z6.h, z13.h\n"
- "csel x10, x10, x21, LT\n"
- "mov p0.b, p2.b\n"
"fmla z22.h, p3/M, z4.h, z13.h\n"
"fmla z21.h, p3/M, z8.h, z17.h\n"
"ld1h { z17.h }, p2/Z, [x26]\n"
- "csel x14, x14, XZR, LT\n"
"fmla z24.h, p3/M, z1.h, z16.h\n"
"fmla z23.h, p3/M, z0.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
- "cmp x10, x20\n"
+ "ld1h { z16.h }, p2/Z, [x26, x27, LSL #1]\n"
"fmla z22.h, p3/M, z5.h, z20.h\n"
"fmla z21.h, p3/M, z4.h, z20.h\n"
"fmla z24.h, p3/M, z2.h, z18.h\n"
@@ -278,35 +278,35 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmla z21.h, p3/M, z2.h, z16.h\n"
"fmla z24.h, p3/M, z8.h, z20.h\n"
"fmla z23.h, p3/M, z7.h, z20.h\n"
- "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x25, x27, LSL #1]\n"
"fmla z22.h, p3/M, z3.h, z19.h\n"
"fmla z21.h, p3/M, z5.h, z18.h\n"
"fmla z24.h, p3/M, z3.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x14, LSL #1]\n"
"fmla z23.h, p3/M, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x28, LSL #1]\n"
"fmla z22.h, p3/M, z7.h, z17.h\n"
"fmla z21.h, p3/M, z6.h, z17.h\n"
"fmla z24.h, p3/M, z6.h, z19.h\n"
"fmla z23.h, p3/M, z8.h, z18.h\n"
- "fmax z24.h, p3/M, z24.h, z26.h\n"
- "fmax z23.h, p3/M, z23.h, z26.h\n"
"fmla z22.h, p3/M, z8.h, z16.h\n"
"fmla z21.h, p3/M, z7.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z26.h\n"
- "fmax z21.h, p3/M, z21.h, z26.h\n"
- "fmin z24.h, p3/M, z24.h, z25.h\n"
- "fmin z23.h, p3/M, z23.h, z25.h\n"
- "st1h { z24.h }, p0, [x28]\n"
- "fmin z22.h, p3/M, z22.h, z25.h\n"
- "fmin z21.h, p3/M, z21.h, z25.h\n"
- "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
- "st1h { z22.h }, p0, [x22]\n"
- "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z27.h\n"
+ "fmax z23.h, p3/M, z23.h, z27.h\n"
+ "fmin z24.h, p3/M, z24.h, z26.h\n"
+ "fmin z23.h, p3/M, z23.h, z26.h\n"
+ "fmax z22.h, p3/M, z22.h, z27.h\n"
+ "fmax z21.h, p3/M, z21.h, z27.h\n"
+ "st1h { z24.h }, p0, [x9]\n"
+ "st1h { z23.h }, p0, [x9, x13, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z26.h\n"
+ "fmin z21.h, p3/M, z21.h, z26.h\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z21.h }, p0, [x23, x13, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
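(Note: the fp16 hunk above keeps the same per-channel structure throughout: each accumulator is seeded from the bias register via movprfx, nine fmla taps accumulate the 3x3 window, and the fmax/fmin pair applies the activation clamp before the stores. The sketch below is only an illustrative scalar reference for one channel of the 2x2-output, stride-1 tile — names are hypothetical and fp32 is used instead of fp16 for brevity.)

    #include <algorithm>

    // One channel of a 3x3, stride-1 depthwise tile with a 2x2 output:
    // a 4x4 input patch feeds four accumulators, each started from the bias.
    static void dw3x3_s1_2x2_channel(const float in[4][4], const float w[3][3],
                                     float bias, float act_min, float act_max,
                                     float out[2][2])
    {
        for (int oy = 0; oy < 2; oy++)
        {
            for (int ox = 0; ox < 2; ox++)
            {
                float acc = bias;  // movprfx from the bias register
                for (int ky = 0; ky < 3; ky++)
                {
                    for (int kx = 0; kx < 3; kx++)
                    {
                        acc += w[ky][kx] * in[oy + ky][ox + kx];  // fmla tap
                    }
                }
                // fmax with the activation minimum, fmin with the maximum.
                out[oy][ox] = std::min(std::max(acc, act_min), act_max);
            }
        }
    }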
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 90982b6990..ff85bc51c7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -83,210 +83,210 @@ void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"cnth x14\n"
- "ldp x13, x12, [x20, #0x0]\n"
- "ldp x11, x10, [x20, #0x10]\n"
- "mov x9, #0x0\n"
+ "mov x13, #0x0\n"
+ "ldr x24, [x15, #0x20]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z20.h }, p3/Z, [x16]\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "ld1h { z27.h }, p3/Z, [x16]\n"
"ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
- "cmp x14, %x[n_channels]\n"
"ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
"ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
"sub x28, XZR, x14\n"
"ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
"ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
"ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
"ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
"addvl x16, x16, #-6\n"
- "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z9.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x13, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x24, x13, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
- "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
"ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x22, [x15, #0x38]\n"
+ "ldr x25, [x15, #0x30]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ldr x24, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr x23, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "inch x28\n"
+ "ld1h { z18.h }, p2/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "mov p0.b, p2.b\n"
"fmla z24.h, p3/M, z0.h, z10.h\n"
"fmla z23.h, p3/M, z2.h, z11.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x48]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x20, x13, LSL #1]\n"
"fmla z22.h, p3/M, z2.h, z12.h\n"
"fmla z21.h, p3/M, z1.h, z12.h\n"
- "ldr x20, [x15, #0x40]\n"
- "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "ld1h { z27.h }, p3/Z, [x16]\n"
"fmla z24.h, p3/M, z5.h, z12.h\n"
"fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x22, [x15, #0x50]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x13, LSL #1]\n"
+ "ldr x25, [x15, #0x78]\n"
"fmla z22.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
"fmla z21.h, p3/M, z3.h, z13.h\n"
- "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x58]\n"
"fmla z24.h, p3/M, z7.h, z13.h\n"
"fmla z23.h, p3/M, z6.h, z13.h\n"
- "ldr x20, [x15, #0x60]\n"
- "ldr x27, [x15, #0x68]\n"
"fmla z22.h, p3/M, z4.h, z13.h\n"
"fmla z21.h, p3/M, z8.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x26, [x15, #0x70]\n"
+ "ld1h { z17.h }, p2/Z, [x22, x13, LSL #1]\n"
"fmla z24.h, p3/M, z1.h, z16.h\n"
"fmla z23.h, p3/M, z0.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x25, [x15, #0x78]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x13, LSL #1]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
"fmla z22.h, p3/M, z5.h, z20.h\n"
"fmla z21.h, p3/M, z4.h, z20.h\n"
- "whilelt p1.h, x14, %x[n_channels]\n"
- "ldp x24, x23, [x15, #0x0]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
"fmla z24.h, p3/M, z2.h, z18.h\n"
"fmla z23.h, p3/M, z1.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldp x22, x21, [x15, #0x10]\n"
+ "ld1h { z19.h }, p2/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
"fmla z22.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
"fmla z21.h, p3/M, z2.h, z16.h\n"
- "ldr x20, [x15, #0x20]\n"
- "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
"fmla z24.h, p3/M, z8.h, z20.h\n"
+ "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
"fmla z23.h, p3/M, z7.h, z20.h\n"
- "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
- "inch x28\n"
+ "ld1h { z18.h }, p2/Z, [x27, x13, LSL #1]\n"
"fmla z22.h, p3/M, z3.h, z19.h\n"
"fmla z21.h, p3/M, z5.h, z18.h\n"
- "mov p0.b, p2.b\n"
- "ld1h { z20.h }, p3/Z, [x16]\n"
"fmla z24.h, p3/M, z3.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
"fmla z23.h, p3/M, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "inch x13\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
"fmla z22.h, p3/M, z7.h, z17.h\n"
"fmla z21.h, p3/M, z6.h, z17.h\n"
- "inch x9\n"
"ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
"fmla z24.h, p3/M, z6.h, z19.h\n"
- "fmla z23.h, p3/M, z8.h, z18.h\n"
"ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
"ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "whilelt p2.h, x13, %x[n_channels]\n"
"fmla z22.h, p3/M, z8.h, z16.h\n"
"fmla z21.h, p3/M, z7.h, z16.h\n"
"ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
"inch x14\n"
"fmax z24.h, p3/M, z24.h, z26.h\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
"fmax z23.h, p3/M, z23.h, z26.h\n"
- "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
"fmax z22.h, p3/M, z22.h, z26.h\n"
"fmax z21.h, p3/M, z21.h, z26.h\n"
- "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "whilelt p2.h, x9, %x[n_channels]\n"
"cmp x14, %x[n_channels]\n"
- "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
- "addvl x16, x16, #16\n"
"fmin z24.h, p3/M, z24.h, z25.h\n"
- "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
"fmin z23.h, p3/M, z23.h, z25.h\n"
"fmin z22.h, p3/M, z22.h, z25.h\n"
- "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
"fmin z21.h, p3/M, z21.h, z25.h\n"
- "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z24.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z23.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x9, x28, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
- "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x22, [x15, #0x38]\n"
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ldr x27, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr x26, [x15, #0x40]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z18.h }, p2/Z, [x22, x13, LSL #1]\n"
+ "ldr x24, [x15, #0x58]\n"
"fmla z24.h, p3/M, z0.h, z10.h\n"
"fmla z23.h, p3/M, z2.h, z11.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x48]\n"
+ "ld1h { z17.h }, p2/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x20, x13, LSL #1]\n"
"fmla z22.h, p3/M, z2.h, z12.h\n"
"fmla z21.h, p3/M, z1.h, z12.h\n"
- "ldr x20, [x15, #0x40]\n"
- "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla z24.h, p3/M, z5.h, z12.h\n"
"fmla z23.h, p3/M, z4.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x21, [x15, #0x50]\n"
+ "ld1h { z16.h }, p2/Z, [x27, x13, LSL #1]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla z22.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x13, LSL #1]\n"
"fmla z21.h, p3/M, z3.h, z13.h\n"
- "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x20, [x15, #0x58]\n"
"fmla z24.h, p3/M, z7.h, z13.h\n"
"fmla z23.h, p3/M, z6.h, z13.h\n"
- "ldr x23, [x15, #0x60]\n"
- "ldr x22, [x15, #0x68]\n"
"fmla z22.h, p3/M, z4.h, z13.h\n"
"fmla z21.h, p3/M, z8.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ldr x21, [x15, #0x70]\n"
+ "ld1h { z17.h }, p2/Z, [x25, x13, LSL #1]\n"
"fmla z24.h, p3/M, z1.h, z16.h\n"
"fmla z23.h, p3/M, z0.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x20, [x15, #0x78]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x13, LSL #1]\n"
"fmla z22.h, p3/M, z5.h, z20.h\n"
"fmla z21.h, p3/M, z4.h, z20.h\n"
- "inch x28\n"
- "mov p0.b, p2.b\n"
"fmla z24.h, p3/M, z2.h, z18.h\n"
"fmla z23.h, p3/M, z1.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x23, x13, LSL #1]\n"
"fmla z22.h, p3/M, z0.h, z17.h\n"
"fmla z21.h, p3/M, z2.h, z16.h\n"
"fmla z24.h, p3/M, z8.h, z20.h\n"
"fmla z23.h, p3/M, z7.h, z20.h\n"
- "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x22, x13, LSL #1]\n"
"fmla z22.h, p3/M, z3.h, z19.h\n"
"fmla z21.h, p3/M, z5.h, z18.h\n"
"fmla z24.h, p3/M, z3.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x21, x13, LSL #1]\n"
"fmla z23.h, p3/M, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x13, LSL #1]\n"
"fmla z22.h, p3/M, z7.h, z17.h\n"
"fmla z21.h, p3/M, z6.h, z17.h\n"
"fmla z24.h, p3/M, z6.h, z19.h\n"
"fmla z23.h, p3/M, z8.h, z18.h\n"
- "fmax z24.h, p3/M, z24.h, z26.h\n"
- "fmax z23.h, p3/M, z23.h, z26.h\n"
"fmla z22.h, p3/M, z8.h, z16.h\n"
"fmla z21.h, p3/M, z7.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z26.h\n"
- "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
"fmin z24.h, p3/M, z24.h, z25.h\n"
"fmin z23.h, p3/M, z23.h, z25.h\n"
- "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "st1h { z24.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z23.h }, p0, [x11, x28, LSL #1]\n"
"fmin z22.h, p3/M, z22.h, z25.h\n"
"fmin z21.h, p3/M, z21.h, z25.h\n"
- "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x9, x28, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index a22ab39d6f..0b903917bc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,369 +88,369 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x13, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x5, #0x0\n"
+ "mov x6, #0x0\n"
"1:" // Tile loop
- "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x3\n"
"mov x25, #0x3\n"
- "mov x24, #0x3\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
- "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cnth x15\n"
- "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
- "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x12, x17, x17\n"
- "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
- "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x10, x14, x23, LSL #1\n"
- "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
- "add x9, x10, x23, LSL #1\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cnth x8\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z14.h }, p3/Z, [x13]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
- "add x28, x9, x23, LSL #1\n"
- "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
- "add x27, x12, x17\n"
- "add x11, x11, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
- "add x26, x28, x23, LSL #1\n"
- "add x25, x27, x17\n"
- "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
- "addvl x13, x13, #16\n"
- "add x24, x11, x21, LSL #1\n"
- "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "cmp x15, %x[n_channels]\n"
- "add x23, x24, x21, LSL #1\n"
- "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
- "add x22, x16, x16\n"
- "mov x21, #0x0\n"
- "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x9, x12, LSL #1]\n"
- "sub x20, XZR, x15\n"
- "ld1h { z10.h }, p2/Z, [x14]\n"
- "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n"
- "addvl x13, x13, #-6\n"
- "ld1h { z12.h }, p2/Z, [x26]\n"
- "ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
+ "mov x16, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x22, x5, x24\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x12, x7, x7\n"
+ "cmp x8, %x[n_channels]\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x21, x5, x23\n" // offset = tile_i * ld_output_row
+ "add x11, x12, x7\n"
+ "add x10, x17, x17\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "madd x22, x6, x7, x22\n" // offset += tile_j * ld_input_col
+ "ld1h { z31.h }, p3/Z, [x14]\n"
+ "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n"
+ "add x9, x11, x7\n"
+ "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n"
+ "sub x20, XZR, x8\n"
+ "madd x21, x6, x17, x21\n" // offset += tile_j * ld_output_col
+ "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n"
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "add x15, x15, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x28, x15, x24, LSL #1\n"
+ "add x27, x28, x24, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x15]\n"
+ "ld1h { z11.h }, p2/Z, [x15, x9, LSL #1]\n"
+ "add x26, x27, x24, LSL #1\n"
+ "add x13, x13, x21, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x25, x26, x24, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n"
+ "add x24, x13, x23, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x27, x12, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x25]\n"
+ "addvl x14, x14, #-6\n"
+ "add x23, x24, x23, LSL #1\n"
+ "ld1h { z13.h }, p2/Z, [x28, x12, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
- "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x15, %x[n_channels]\n"
- "inch x21\n"
- "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "inch x15\n"
+ "movprfx z30, z31\n fmla z30.h, p3/M, z7.h, z9.h\n"
+ "movprfx z29, z31\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x8, %x[n_channels]\n"
+ "inch x16\n"
+ "movprfx z28, z31\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z31\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "inch x8\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z31\n fmla z26.h, p3/M, z4.h, z9.h\n"
+ "movprfx z25, z31\n fmla z25.h, p3/M, z3.h, z9.h\n"
"inch x20\n"
- "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
- "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
- "fmla z29.h, p3/M, z6.h, z18.h\n"
- "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z27.h, p3/M, z3.h, z13.h\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z25.h, p3/M, z1.h, z13.h\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z22.h, p3/M, z6.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
- "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z17.h\n"
- "ld1h { z14.h }, p3/Z, [x13]\n"
- "fmla z21.h, p3/M, z8.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z18.h\n"
- "fmla z20.h, p3/M, z0.h, z18.h\n"
- "fmla z26.h, p3/M, z4.h, z18.h\n"
- "fmla z25.h, p3/M, z3.h, z18.h\n"
- "fmla z22.h, p3/M, z1.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x10]\n"
- "fmla z29.h, p3/M, z2.h, z16.h\n"
- "fmla z27.h, p3/M, z1.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x28]\n"
- "fmla z24.h, p3/M, z4.h, z23.h\n"
- "fmla z28.h, p3/M, z1.h, z17.h\n"
- "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z20.h, p3/M, z2.h, z23.h\n"
- "fmla z21.h, p3/M, z1.h, z23.h\n"
- "fmla z29.h, p3/M, z8.h, z23.h\n"
- "fmla z27.h, p3/M, z7.h, z23.h\n"
- "fmla z25.h, p3/M, z5.h, z23.h\n"
- "fmla z26.h, p3/M, z0.h, z19.h\n"
- "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
- "fmla z22.h, p3/M, z3.h, z18.h\n"
- "fmla z24.h, p3/M, z2.h, z16.h\n"
- "fmla z20.h, p3/M, z4.h, z17.h\n"
- "fmla z21.h, p3/M, z3.h, z17.h\n"
- "fmla z28.h, p3/M, z3.h, z19.h\n"
- "fmla z27.h, p3/M, z5.h, z16.h\n"
- "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z18.h\n"
- "fmla z25.h, p3/M, z7.h, z17.h\n"
- "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
- "fmla z22.h, p3/M, z5.h, z17.h\n"
- "fmla z24.h, p3/M, z6.h, z17.h\n"
- "fmla z21.h, p3/M, z5.h, z19.h\n"
- "fmla z20.h, p3/M, z6.h, z16.h\n"
- "fmla z26.h, p3/M, z8.h, z17.h\n"
- "fmla z22.h, p3/M, z7.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z18.h\n"
- "fmla z25.h, p3/M, z0.h, z18.h\n"
- "fmla z24.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
- "fmla z20.h, p3/M, z8.h, z17.h\n"
- "addvl x10, x10, #1\n"
- "fmla z21.h, p3/M, z7.h, z17.h\n"
- "fmla z28.h, p3/M, z4.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z18.h\n"
- "fmla z29.h, p3/M, z5.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
- "addvl x28, x28, #1\n"
- "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "movprfx z24, z31\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z23, z31\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z22.h }, p2/Z, [x27, x11, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z13.h\n"
+ "fmla z26.h, p3/M, z1.h, z13.h\n"
+ "fmla z25.h, p3/M, z0.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "movprfx z21, z31\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z31.h }, p3/Z, [x14]\n"
+ "fmla z30.h, p3/M, z6.h, z17.h\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, x11, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z0.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z30.h, p3/M, z0.h, z18.h\n"
+ "fmla z29.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x28]\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "fmla z25.h, p3/M, z4.h, z22.h\n"
+ "fmla z23.h, p3/M, z1.h, z22.h\n"
+ "fmla z26.h, p3/M, z5.h, z22.h\n"
+ "fmla z21.h, p3/M, z2.h, z22.h\n"
+ "fmla z27.h, p3/M, z0.h, z20.h\n"
+ "fmla z30.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z29.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z22.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
"fmla z25.h, p3/M, z2.h, z16.h\n"
- "fmla z24.h, p3/M, z1.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "addvl x14, x14, #1\n"
- "fmla z20.h, p3/M, z3.h, z17.h\n"
- "fmla z21.h, p3/M, z4.h, z19.h\n"
- "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
- "ld1h { z10.h }, p1/Z, [x14]\n"
- "fmla z26.h, p3/M, z7.h, z17.h\n"
- "fmla z25.h, p3/M, z6.h, z17.h\n"
- "ld1h { z18.h }, p2/Z, [x9]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z1.h, z16.h\n"
- "fmax z29.h, p3/M, z29.h, z31.h\n"
- "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
- "fmla z27.h, p3/M, z0.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z24.h, p3/M, z7.h, z19.h\n"
- "addvl x9, x9, #1\n"
- "fmla z20.h, p3/M, z5.h, z19.h\n"
- "fmla z22.h, p3/M, z0.h, z18.h\n"
- "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
- "fmin z29.h, p3/M, z29.h, z30.h\n"
- "fmla z21.h, p3/M, z2.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
- "fmax z25.h, p3/M, z25.h, z31.h\n"
- "fmla z28.h, p3/M, z6.h, z18.h\n"
- "fmla z26.h, p3/M, z3.h, z18.h\n"
- "fmax z28.h, p3/M, z28.h, z31.h\n"
- "fmax z26.h, p3/M, z26.h, z31.h\n"
- "fmla z27.h, p3/M, z8.h, z17.h\n"
- "fmla z24.h, p3/M, z5.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z31.h\n"
- "fmax z24.h, p3/M, z24.h, z31.h\n"
- "fmla z22.h, p3/M, z8.h, z16.h\n"
- "fmla z20.h, p3/M, z7.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z31.h\n"
- "fmax z20.h, p3/M, z20.h, z31.h\n"
+ "fmla z27.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x7, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z23.h, p3/M, z3.h, z18.h\n"
+ "fmla z26.h, p3/M, z7.h, z18.h\n"
+ "fmla z24.h, p3/M, z5.h, z18.h\n"
+ "fmla z25.h, p3/M, z6.h, z18.h\n"
+ "fmla z27.h, p3/M, z8.h, z18.h\n"
+ "fmla z30.h, p3/M, z3.h, z19.h\n"
"fmla z21.h, p3/M, z6.h, z16.h\n"
- "fmax z21.h, p3/M, z21.h, z31.h\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z5.h, z17.h\n"
+ "fmla z26.h, p3/M, z0.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x11, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x11, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x7, LSL #1]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "fmla z23.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x11, LSL #1]\n"
"addvl x26, x26, #1\n"
- "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
- "addvl x13, x13, #16\n"
- "fmin z28.h, p3/M, z28.h, z30.h\n"
- "ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z30.h\n"
- "fmin z26.h, p3/M, z26.h, z30.h\n"
- "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
- "ld1h { z12.h }, p1/Z, [x26]\n"
- "fmin z25.h, p3/M, z25.h, z30.h\n"
- "fmin z24.h, p3/M, z24.h, z30.h\n"
- "ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
- "st1h { z28.h }, p0, [x11]\n"
- "fmin z22.h, p3/M, z22.h, z30.h\n"
- "fmin z20.h, p3/M, z20.h, z30.h\n"
- "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
- "fmin z21.h, p3/M, z21.h, z30.h\n"
- "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
- "addvl x11, x11, #1\n"
- "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
- "st1h { z26.h }, p0, [x24]\n"
- "addvl x13, x13, #-6\n"
- "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
- "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z16.h\n"
+ "fmla z26.h, p3/M, z2.h, z16.h\n"
+ "fmla z25.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, x12, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z17.h\n"
+ "addvl x15, x15, #1\n"
+ "fmla z21.h, p3/M, z3.h, z17.h\n"
+ "fmla z27.h, p3/M, z7.h, z17.h\n"
+ "fmla z23.h, p3/M, z4.h, z19.h\n"
+ "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n"
+ "fmla z26.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x27]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x15]\n"
+ "fmla z28.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z19.h\n"
+ "addvl x27, x27, #1\n"
+ "fmla z21.h, p3/M, z5.h, z19.h\n"
+ "fmla z24.h, p3/M, z0.h, z18.h\n"
+ "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x12, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z18.h\n"
+ "addvl x25, x25, #1\n"
+ "fmla z23.h, p3/M, z2.h, z17.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "fmla z25.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n"
+ "fmla z24.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "cmp x8, %x[n_channels]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "ld1h { z9.h }, p1/Z, [x27, x12, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x15, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "ld1h { z12.h }, p1/Z, [x25]\n"
+ "ld1h { z13.h }, p1/Z, [x28, x12, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z29.h }, p0, [x13]\n"
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z30.h }, p0, [x13, x17, LSL #1]\n"
+ "st1h { z28.h }, p0, [x13, x10, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "addvl x14, x14, #-6\n"
+ "st1h { z27.h }, p0, [x24]\n"
+ "st1h { z26.h }, p0, [x24, x17, LSL #1]\n"
+ "st1h { z25.h }, p0, [x24, x10, LSL #1]\n"
"addvl x24, x24, #1\n"
- "st1h { z22.h }, p0, [x23]\n"
- "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
- "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23]\n"
+ "st1h { z21.h }, p0, [x23, x17, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x10, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
- "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z4.h, z13.h\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x8, x8, #0x1\n"
- "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
- "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
- "cmp x8, x20\n"
- "add x21, x13, #0x1\n"
- "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
- "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
- "csel x13, x13, x21, LT\n"
- "fmla z29.h, p3/M, z6.h, z18.h\n"
- "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "movprfx z30, z31\n fmla z30.h, p3/M, z7.h, z9.h\n"
+ "movprfx z29, z31\n fmla z29.h, p3/M, z8.h, z9.h\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z28, z31\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z31\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "movprfx z26, z31\n fmla z26.h, p3/M, z4.h, z9.h\n"
+ "movprfx z25, z31\n fmla z25.h, p3/M, z3.h, z9.h\n"
"mov p0.b, p2.b\n"
- "csel x8, x8, XZR, LT\n"
- "fmla z28.h, p3/M, z5.h, z13.h\n"
- "fmla z27.h, p3/M, z3.h, z13.h\n"
- "cmp x13, x20\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z25.h, p3/M, z1.h, z13.h\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
- "fmla z22.h, p3/M, z6.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
- "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
- "fmla z28.h, p3/M, z7.h, z18.h\n"
- "fmla z20.h, p3/M, z0.h, z18.h\n"
- "fmla z26.h, p3/M, z4.h, z18.h\n"
- "fmla z25.h, p3/M, z3.h, z18.h\n"
- "fmla z22.h, p3/M, z1.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x10]\n"
- "fmla z29.h, p3/M, z2.h, z16.h\n"
- "fmla z27.h, p3/M, z1.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x28]\n"
- "fmla z24.h, p3/M, z4.h, z23.h\n"
- "fmla z28.h, p3/M, z1.h, z17.h\n"
- "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
- "fmla z20.h, p3/M, z2.h, z23.h\n"
- "fmla z21.h, p3/M, z1.h, z23.h\n"
- "fmla z29.h, p3/M, z8.h, z23.h\n"
- "fmla z27.h, p3/M, z7.h, z23.h\n"
- "fmla z25.h, p3/M, z5.h, z23.h\n"
+ "movprfx z24, z31\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z23, z31\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "add x6, x6, #0x1\n"
+ "add x20, x5, #0x1\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z22.h }, p2/Z, [x27, x11, LSL #1]\n"
+ "cmp x6, x22\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z13.h\n"
+ "csel x5, x5, x20, LT\n"
+ "fmla z26.h, p3/M, z1.h, z13.h\n"
+ "fmla z25.h, p3/M, z0.h, z13.h\n"
+ "csel x6, x6, XZR, LT\n"
+ "fmla z24.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "movprfx z21, z31\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z30.h, p3/M, z6.h, z17.h\n"
+ "fmla z29.h, p3/M, z5.h, z13.h\n"
+ "cmp x5, x21\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, x11, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "fmla z21.h, p3/M, z0.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z30.h, p3/M, z0.h, z18.h\n"
+ "fmla z29.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x28]\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "fmla z25.h, p3/M, z4.h, z22.h\n"
+ "fmla z23.h, p3/M, z1.h, z22.h\n"
+ "fmla z26.h, p3/M, z5.h, z22.h\n"
+ "fmla z21.h, p3/M, z2.h, z22.h\n"
+ "fmla z27.h, p3/M, z0.h, z20.h\n"
+ "fmla z30.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z29.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z22.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "fmla z25.h, p3/M, z2.h, z16.h\n"
+ "fmla z27.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x7, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z18.h\n"
+ "fmla z23.h, p3/M, z3.h, z18.h\n"
+ "fmla z26.h, p3/M, z7.h, z18.h\n"
+ "fmla z24.h, p3/M, z5.h, z18.h\n"
+ "fmla z25.h, p3/M, z6.h, z18.h\n"
+ "fmla z27.h, p3/M, z8.h, z18.h\n"
+ "fmla z30.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z29.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z5.h, z17.h\n"
"fmla z26.h, p3/M, z0.h, z19.h\n"
- "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
- "fmla z22.h, p3/M, z3.h, z18.h\n"
- "fmla z24.h, p3/M, z2.h, z16.h\n"
- "fmla z20.h, p3/M, z4.h, z17.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x11, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x11, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x7, LSL #1]\n"
+ "fmla z21.h, p3/M, z8.h, z18.h\n"
+ "fmla z23.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x11, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z16.h\n"
+ "fmla z26.h, p3/M, z2.h, z16.h\n"
+ "fmla z25.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, x12, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z17.h\n"
"fmla z21.h, p3/M, z3.h, z17.h\n"
- "fmla z28.h, p3/M, z3.h, z19.h\n"
- "fmla z27.h, p3/M, z5.h, z16.h\n"
- "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
- "fmla z26.h, p3/M, z6.h, z18.h\n"
- "fmla z25.h, p3/M, z7.h, z17.h\n"
- "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
- "fmla z22.h, p3/M, z5.h, z17.h\n"
- "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z27.h, p3/M, z7.h, z17.h\n"
+ "fmla z23.h, p3/M, z4.h, z19.h\n"
+ "fmla z26.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x27]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "fmla z30.h, p3/M, z1.h, z16.h\n"
+ "fmla z28.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z19.h\n"
"fmla z21.h, p3/M, z5.h, z19.h\n"
- "fmla z20.h, p3/M, z6.h, z16.h\n"
- "fmla z26.h, p3/M, z8.h, z17.h\n"
- "fmla z22.h, p3/M, z7.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z18.h\n"
- "fmla z25.h, p3/M, z0.h, z18.h\n"
- "fmla z24.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
- "fmla z20.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z7.h, z17.h\n"
- "fmla z28.h, p3/M, z4.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z26.h, p3/M, z1.h, z18.h\n"
- "fmla z29.h, p3/M, z5.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z16.h\n"
- "fmla z25.h, p3/M, z2.h, z16.h\n"
- "fmla z24.h, p3/M, z1.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "fmla z20.h, p3/M, z3.h, z17.h\n"
- "fmla z21.h, p3/M, z4.h, z19.h\n"
- "fmla z26.h, p3/M, z7.h, z17.h\n"
- "fmla z25.h, p3/M, z6.h, z17.h\n"
- "ld1h { z18.h }, p2/Z, [x9]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z29.h, p3/M, z1.h, z16.h\n"
- "fmax z29.h, p3/M, z29.h, z31.h\n"
- "fmin z29.h, p3/M, z29.h, z30.h\n"
- "fmla z27.h, p3/M, z0.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
- "fmla z24.h, p3/M, z7.h, z19.h\n"
- "fmla z20.h, p3/M, z5.h, z19.h\n"
- "fmla z22.h, p3/M, z0.h, z18.h\n"
- "fmla z21.h, p3/M, z2.h, z17.h\n"
- "fmla z25.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
- "fmax z25.h, p3/M, z25.h, z31.h\n"
- "fmla z28.h, p3/M, z6.h, z18.h\n"
- "fmla z26.h, p3/M, z3.h, z18.h\n"
- "fmax z28.h, p3/M, z28.h, z31.h\n"
- "fmax z26.h, p3/M, z26.h, z31.h\n"
- "fmla z27.h, p3/M, z8.h, z17.h\n"
- "fmla z24.h, p3/M, z5.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z31.h\n"
- "fmax z24.h, p3/M, z24.h, z31.h\n"
- "fmla z22.h, p3/M, z8.h, z16.h\n"
- "fmla z20.h, p3/M, z7.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z31.h\n"
- "fmax z20.h, p3/M, z20.h, z31.h\n"
- "fmla z21.h, p3/M, z6.h, z16.h\n"
- "fmax z21.h, p3/M, z21.h, z31.h\n"
- "fmin z28.h, p3/M, z28.h, z30.h\n"
- "st1h { z28.h }, p0, [x11]\n"
- "fmin z27.h, p3/M, z27.h, z30.h\n"
- "fmin z26.h, p3/M, z26.h, z30.h\n"
- "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z30.h\n"
- "fmin z24.h, p3/M, z24.h, z30.h\n"
- "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
- "fmin z22.h, p3/M, z22.h, z30.h\n"
- "fmin z20.h, p3/M, z20.h, z30.h\n"
- "st1h { z26.h }, p0, [x24]\n"
- "fmin z21.h, p3/M, z21.h, z30.h\n"
- "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
- "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
- "st1h { z22.h }, p0, [x23]\n"
- "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
- "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+ "fmla z24.h, p3/M, z0.h, z18.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x12, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z18.h\n"
+ "fmla z23.h, p3/M, z2.h, z17.h\n"
+ "fmla z29.h, p3/M, z6.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "fmla z25.h, p3/M, z5.h, z17.h\n"
+ "fmla z24.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "fmla z23.h, p3/M, z6.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "st1h { z27.h }, p0, [x24]\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
+ "st1h { z26.h }, p0, [x24, x17, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z29.h }, p0, [x13]\n"
+ "st1h { z30.h }, p0, [x13, x17, LSL #1]\n"
+ "st1h { z28.h }, p0, [x13, x10, LSL #1]\n"
+ "st1h { z25.h }, p0, [x24, x10, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23]\n"
+ "st1h { z21.h }, p0, [x23, x17, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x10, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 4f8368acd5..ecf912303d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,384 +90,384 @@ void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
"add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1h { z14.h }, p3/Z, [x8]\n"
- "cnth x16\n"
- "mov x15, #0x0\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "cnth x15\n"
+ "mov x14, #0x0\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ld1h { z15.h }, p3/Z, [x8]\n"
"ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
- "whilelt p2.h, XZR, %x[n_channels]\n"
"ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "sub x13, XZR, x15\n"
"ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
- "sub x14, XZR, x16\n"
"ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
"addvl x8, x8, #16\n"
- "ldp x24, x23, [x17, #0x0]\n"
- "ldp x22, x21, [x17, #0x10]\n"
- "ldr x20, [x17, #0x20]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x14, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
- "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
"addvl x8, x8, #-6\n"
- "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x14, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
- "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
- "ldr x23, [x17, #0x30]\n"
- "ldr x26, [x17, #0x38]\n"
- "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z10.h\n"
- "ldr x22, [x17, #0x28]\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x27, [x17, #0x38]\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "ldr x26, [x17, #0x28]\n"
"ldr x21, [x17, #0x48]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z4.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z3.h, z9.h\n"
"ldr x20, [x17, #0x40]\n"
- "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
- "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
- "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
"ldr x25, [x17, #0x50]\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z0.h, z9.h\n"
"ldr x24, [x17, #0x58]\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
- "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
"ldr x23, [x17, #0x60]\n"
- "fmla z29.h, p3/M, z5.h, z13.h\n"
- "fmla z28.h, p3/M, z6.h, z18.h\n"
- "ldr x12, [x17, #0x70]\n"
- "ldr x11, [x17, #0x88]\n"
- "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
- "fmla z27.h, p3/M, z3.h, z13.h\n"
- "inch x14\n"
+ "fmla z30.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z22.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldr x12, [x17, #0x88]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z13.h\n"
+ "ldr x22, [x17, #0x70]\n"
+ "fmla z26.h, p3/M, z1.h, z13.h\n"
+ "fmla z25.h, p3/M, z0.h, z13.h\n"
+ "inch x13\n"
"mov p1.b, p2.b\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z25.h, p3/M, z1.h, z13.h\n"
- "ldr x10, [x13, #0x0]\n"
- "whilelt p0.h, x16, %x[n_channels]\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
- "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
- "fmla z29.h, p3/M, z7.h, z18.h\n"
- "ldr x22, [x17, #0x68]\n"
- "ldr x21, [x17, #0x78]\n"
- "fmla z28.h, p3/M, z0.h, z17.h\n"
- "fmla z22.h, p3/M, z8.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "movprfx z21, z15\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z17.h\n"
+ "ldr x11, [x16, #0x0]\n"
+ "whilelt p0.h, x15, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "ldr x10, [x17, #0x78]\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
"ldr x20, [x17, #0x80]\n"
- "fmla z26.h, p3/M, z4.h, z18.h\n"
- "fmla z25.h, p3/M, z3.h, z18.h\n"
- "ldr x9, [x13, #0x8]\n"
- "ldr x28, [x13, #0x10]\n"
- "fmla z21.h, p3/M, z0.h, z18.h\n"
- "fmla z24.h, p3/M, z4.h, z19.h\n"
- "ldr x27, [x13, #0x18]\n"
- "ld1h { z14.h }, p3/Z, [x8]\n"
- "fmla z23.h, p3/M, z1.h, z18.h\n"
- "fmla z29.h, p3/M, z1.h, z17.h\n"
- "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z27.h, p3/M, z1.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z21.h, p3/M, z0.h, z17.h\n"
+ "fmla z25.h, p3/M, z4.h, z22.h\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "fmla z30.h, p3/M, z7.h, z17.h\n"
+ "fmla z29.h, p3/M, z0.h, z18.h\n"
+ "ldr x27, [x16, #0x18]\n"
+ "ld1h { z15.h }, p3/Z, [x8]\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x14, LSL #1]\n"
"ldr x26, [x17, #0x90]\n"
- "fmla z25.h, p3/M, z5.h, z19.h\n"
- "fmla z21.h, p3/M, z2.h, z19.h\n"
- "ldr x25, [x17, #0xa0]\n"
- "ldr x24, [x17, #0x98]\n"
- "fmla z26.h, p3/M, z0.h, z20.h\n"
- "fmla z24.h, p3/M, z2.h, z17.h\n"
- "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z26.h, p3/M, z5.h, z22.h\n"
+ "fmla z23.h, p3/M, z1.h, z22.h\n"
+ "fmla z21.h, p3/M, z2.h, z22.h\n"
+ "fmla z30.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z28.h, p3/M, z7.h, z22.h\n"
+ "fmla z25.h, p3/M, z2.h, z20.h\n"
+ "fmla z24.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ldr x23, [x17, #0xb0]\n"
+ "fmla z29.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldr x22, [x17, #0xa8]\n"
+ "fmla z27.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ldr x21, [x17, #0xc0]\n"
+ "fmla z28.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmla z23.h, p3/M, z3.h, z17.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z30.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z26.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z16.h\n"
+ "fmla z26.h, p3/M, z2.h, z16.h\n"
+ "fmla z25.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x17, #0x20]\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x14, LSL #1]\n"
"fmla z27.h, p3/M, z7.h, z19.h\n"
- "fmla z22.h, p3/M, z1.h, z19.h\n"
- "fmla z23.h, p3/M, z3.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ldr x23, [x17, #0xa8]\n"
- "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z30.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z28.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z21.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x14, LSL #1]\n"
"fmla z25.h, p3/M, z7.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
- "ldr x22, [x17, #0xc0]\n"
- "fmla z24.h, p3/M, z6.h, z18.h\n"
- "fmla z21.h, p3/M, z4.h, z18.h\n"
- "fmla z29.h, p3/M, z3.h, z20.h\n"
- "fmla z27.h, p3/M, z5.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z5.h, z18.h\n"
- "fmla z22.h, p3/M, z3.h, z18.h\n"
- "ldr x21, [x17, #0xb0]\n"
- "ldr x20, [x17, #0xb8]\n"
+ "fmla z23.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmax z29.h, p3/M, z29.h, z14.h\n"
+ "fmla z30.h, p3/M, z6.h, z16.h\n"
+ "fmla z24.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
"fmla z26.h, p3/M, z8.h, z18.h\n"
- "fmla z24.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z6.h, z16.h\n"
- "fmla z28.h, p3/M, z3.h, z19.h\n"
- "fmla z25.h, p3/M, z0.h, z19.h\n"
- "fmla z22.h, p3/M, z5.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z19.h\n"
- "fmla z26.h, p3/M, z1.h, z19.h\n"
- "fmla z28.h, p3/M, z5.h, z17.h\n"
- "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z17.h\n"
- "fmla z25.h, p3/M, z2.h, z17.h\n"
- "fmla z24.h, p3/M, z1.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z18.h\n"
- "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ldr x25, [x17, #0x20]\n"
- "fmla z22.h, p3/M, z7.h, z18.h\n"
- "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z17.h\n"
- "fmla z26.h, p3/M, z7.h, z16.h\n"
- "fmla z25.h, p3/M, z6.h, z16.h\n"
- "fmla z23.h, p3/M, z4.h, z16.h\n"
- "fmla z21.h, p3/M, z3.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z18.h\n"
- "fmla z28.h, p3/M, z1.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z31.h\n"
- "fmin z28.h, p3/M, z28.h, z30.h\n"
- "fmla z27.h, p3/M, z0.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z16.h\n"
- "fmax z29.h, p3/M, z29.h, z31.h\n"
- "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z27.h, p3/M, z3.h, z16.h\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
"fmla z21.h, p3/M, z5.h, z18.h\n"
- "fmin z29.h, p3/M, z29.h, z30.h\n"
- "st1h { z29.h }, p1, [x10, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z0.h, z16.h\n"
- "fmla z22.h, p3/M, z2.h, z17.h\n"
- "ldr x24, [x13, #0x20]\n"
- "st1h { z28.h }, p1, [x9, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z18.h\n"
- "fmla z26.h, p3/M, z3.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z17.h\n"
"ldp x23, x22, [x17, #0x0]\n"
- "fmla z27.h, p3/M, z8.h, z17.h\n"
- "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmla z23.h, p3/M, z2.h, z17.h\n"
+ "fmax z30.h, p3/M, z30.h, z14.h\n"
"ldp x21, x20, [x17, #0x10]\n"
- "fmax z27.h, p3/M, z27.h, z31.h\n"
- "fmla z23.h, p3/M, z8.h, z16.h\n"
- "fmla z21.h, p3/M, z7.h, z16.h\n"
- "fmax z26.h, p3/M, z26.h, z31.h\n"
- "fmax z25.h, p3/M, z25.h, z31.h\n"
- "fmla z22.h, p3/M, z6.h, z16.h\n"
- "inch x15\n"
- "ld1h { z9.h }, p0/Z, [x23, x16, LSL #1]\n"
- "ld1h { z10.h }, p0/Z, [x22, x16, LSL #1]\n"
- "ld1h { z11.h }, p0/Z, [x21, x16, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x20, x16, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z30.h\n"
- "fmin z26.h, p3/M, z26.h, z30.h\n"
- "ld1h { z13.h }, p0/Z, [x25, x16, LSL #1]\n"
- "inch x16\n"
- "fmin z25.h, p3/M, z25.h, z30.h\n"
- "st1h { z27.h }, p1, [x28, x14, LSL #1]\n"
- "fmax z24.h, p3/M, z24.h, z31.h\n"
- "fmax z23.h, p3/M, z23.h, z31.h\n"
- "st1h { z26.h }, p1, [x27, x14, LSL #1]\n"
- "ldr x23, [x13, #0x28]\n"
- "fmax z21.h, p3/M, z21.h, z31.h\n"
- "fmax z22.h, p3/M, z22.h, z31.h\n"
- "st1h { z25.h }, p1, [x24, x14, LSL #1]\n"
- "ldr x22, [x13, #0x30]\n"
- "ldr x21, [x13, #0x38]\n"
- "ldr x20, [x13, #0x40]\n"
- "whilelt p2.h, x15, %x[n_channels]\n"
- "cmp x16, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
- "fmin z24.h, p3/M, z24.h, z30.h\n"
- "fmin z23.h, p3/M, z23.h, z30.h\n"
+ "inch x14\n"
+ "fmax z26.h, p3/M, z26.h, z14.h\n"
+ "fmin z29.h, p3/M, z29.h, z31.h\n"
"ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
- "fmin z21.h, p3/M, z21.h, z30.h\n"
- "fmin z22.h, p3/M, z22.h, z30.h\n"
- "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
"ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
- "st1h { z24.h }, p1, [x23, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z14.h\n"
+ "fmax z27.h, p3/M, z27.h, z14.h\n"
+ "ld1h { z9.h }, p0/Z, [x23, x15, LSL #1]\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z31.h\n"
+ "ld1h { z10.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z11.h }, p0/Z, [x21, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z12.h }, p0/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z13.h }, p0/Z, [x24, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmin z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z27.h, p3/M, z27.h, z31.h\n"
+ "st1h { z29.h }, p1, [x9, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x28]\n"
+ "st1h { z30.h }, p1, [x11, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "fmin z26.h, p3/M, z26.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z14.h\n"
+ "fmax z24.h, p3/M, z24.h, z14.h\n"
+ "fmax z21.h, p3/M, z21.h, z14.h\n"
"ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "fmax z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z28.h }, p1, [x28, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x30]\n"
"addvl x8, x8, #16\n"
- "st1h { z23.h }, p1, [x22, x14, LSL #1]\n"
+ "st1h { z27.h }, p1, [x27, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x38]\n"
+ "whilelt p2.h, x14, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "st1h { z26.h }, p1, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x40]\n"
+ "fmin z25.h, p3/M, z25.h, z31.h\n"
+ "fmin z24.h, p3/M, z24.h, z31.h\n"
+ "fmin z21.h, p3/M, z21.h, z31.h\n"
+ "fmin z23.h, p3/M, z23.h, z31.h\n"
"ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
- "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
"ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
"addvl x8, x8, #-6\n"
- "st1h { z22.h }, p1, [x20, x14, LSL #1]\n"
+ "st1h { z25.h }, p1, [x23, x13, LSL #1]\n"
+ "st1h { z24.h }, p1, [x22, x13, LSL #1]\n"
+ "st1h { z21.h }, p1, [x21, x13, LSL #1]\n"
+ "st1h { z23.h }, p1, [x20, x13, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
- "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
- "ldr x23, [x17, #0x30]\n"
- "ldr x26, [x17, #0x38]\n"
- "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
- "fmla z29.h, p3/M, z0.h, z10.h\n"
- "ldr x22, [x17, #0x28]\n"
+ "movprfx z30, z15\n fmla z30.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z15\n fmla z29.h, p3/M, z7.h, z9.h\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x27, [x17, #0x38]\n"
+ "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z15\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "ldr x26, [x17, #0x28]\n"
"ldr x21, [x17, #0x48]\n"
- "fmla z28.h, p3/M, z4.h, z13.h\n"
- "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z15\n fmla z26.h, p3/M, z4.h, z9.h\n"
+ "movprfx z25, z15\n fmla z25.h, p3/M, z3.h, z9.h\n"
"ldr x20, [x17, #0x40]\n"
- "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
- "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
- "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
"ldr x25, [x17, #0x50]\n"
+ "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "movprfx z23, z15\n fmla z23.h, p3/M, z0.h, z9.h\n"
"ldr x24, [x17, #0x58]\n"
- "fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
- "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
"ldr x23, [x17, #0x60]\n"
- "fmla z29.h, p3/M, z5.h, z13.h\n"
- "fmla z28.h, p3/M, z6.h, z18.h\n"
- "ldr x12, [x17, #0x70]\n"
- "ldr x11, [x17, #0x88]\n"
- "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
- "fmla z27.h, p3/M, z3.h, z13.h\n"
- "inch x14\n"
+ "fmla z30.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z13.h\n"
+ "ld1h { z22.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldr x12, [x17, #0x88]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z13.h\n"
+ "ldr x22, [x17, #0x70]\n"
+ "fmla z26.h, p3/M, z1.h, z13.h\n"
+ "fmla z25.h, p3/M, z0.h, z13.h\n"
+ "inch x13\n"
"mov p0.b, p2.b\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z25.h, p3/M, z1.h, z13.h\n"
- "ldr x10, [x13, #0x0]\n"
- "ldr x9, [x13, #0x8]\n"
- "fmla z24.h, p3/M, z0.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z6.h, z12.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
- "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
- "fmla z29.h, p3/M, z7.h, z18.h\n"
- "ldr x22, [x17, #0x68]\n"
- "ldr x21, [x17, #0x78]\n"
- "fmla z28.h, p3/M, z0.h, z17.h\n"
- "fmla z22.h, p3/M, z8.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "movprfx z21, z15\n fmla z21.h, p3/M, z1.h, z9.h\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z17.h\n"
+ "ldr x11, [x16, #0x0]\n"
+ "ldr x10, [x16, #0x8]\n"
+ "fmla z28.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z17.h\n"
+ "ldr x9, [x17, #0x78]\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
"ldr x20, [x17, #0x80]\n"
- "fmla z26.h, p3/M, z4.h, z18.h\n"
- "fmla z25.h, p3/M, z3.h, z18.h\n"
- "ldr x28, [x13, #0x10]\n"
- "ldr x27, [x13, #0x18]\n"
- "fmla z21.h, p3/M, z0.h, z18.h\n"
- "fmla z24.h, p3/M, z4.h, z19.h\n"
- "fmla z23.h, p3/M, z1.h, z18.h\n"
- "fmla z29.h, p3/M, z1.h, z17.h\n"
- "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
- "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z16.h\n"
- "fmla z27.h, p3/M, z1.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z21.h, p3/M, z0.h, z17.h\n"
+ "fmla z25.h, p3/M, z4.h, z22.h\n"
+ "ldr x28, [x16, #0x10]\n"
+ "ldr x27, [x16, #0x18]\n"
+ "fmla z30.h, p3/M, z7.h, z17.h\n"
+ "fmla z29.h, p3/M, z0.h, z18.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x14, LSL #1]\n"
"ldr x26, [x17, #0x90]\n"
- "fmla z25.h, p3/M, z5.h, z19.h\n"
- "fmla z21.h, p3/M, z2.h, z19.h\n"
- "ldr x25, [x17, #0xa0]\n"
- "ldr x24, [x17, #0x98]\n"
- "fmla z26.h, p3/M, z0.h, z20.h\n"
- "fmla z24.h, p3/M, z2.h, z17.h\n"
- "fmla z28.h, p3/M, z8.h, z19.h\n"
+ "fmla z26.h, p3/M, z5.h, z22.h\n"
+ "fmla z23.h, p3/M, z1.h, z22.h\n"
+ "fmla z21.h, p3/M, z2.h, z22.h\n"
+ "fmla z30.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "fmla z29.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z17.h\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z28.h, p3/M, z7.h, z22.h\n"
+ "fmla z25.h, p3/M, z2.h, z20.h\n"
+ "fmla z24.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ldr x23, [x17, #0xb0]\n"
+ "fmla z29.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldr x22, [x17, #0xa8]\n"
+ "fmla z27.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ldr x21, [x17, #0xc0]\n"
+ "fmla z28.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "fmla z21.h, p3/M, z4.h, z17.h\n"
+ "fmla z24.h, p3/M, z5.h, z17.h\n"
+ "fmla z23.h, p3/M, z3.h, z17.h\n"
+ "fmla z27.h, p3/M, z8.h, z17.h\n"
+ "fmla z29.h, p3/M, z3.h, z18.h\n"
+ "fmla z30.h, p3/M, z4.h, z18.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z26.h, p3/M, z0.h, z18.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z24.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z16.h\n"
+ "fmla z26.h, p3/M, z2.h, z16.h\n"
+ "fmla z25.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x14, LSL #1]\n"
"fmla z27.h, p3/M, z7.h, z19.h\n"
- "fmla z22.h, p3/M, z1.h, z19.h\n"
- "fmla z23.h, p3/M, z3.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
- "ldr x23, [x17, #0xa8]\n"
- "fmla z26.h, p3/M, z6.h, z16.h\n"
+ "fmla z30.h, p3/M, z2.h, z16.h\n"
+ "fmla z29.h, p3/M, z1.h, z16.h\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z28.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z21.h, p3/M, z3.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x14, LSL #1]\n"
"fmla z25.h, p3/M, z7.h, z18.h\n"
- "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
- "ldr x22, [x17, #0xc0]\n"
- "fmla z24.h, p3/M, z6.h, z18.h\n"
- "fmla z21.h, p3/M, z4.h, z18.h\n"
- "fmla z29.h, p3/M, z3.h, z20.h\n"
- "fmla z27.h, p3/M, z5.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z5.h, z18.h\n"
- "fmla z22.h, p3/M, z3.h, z18.h\n"
- "ldr x21, [x17, #0xb0]\n"
- "ldr x20, [x17, #0xb8]\n"
+ "fmla z23.h, p3/M, z4.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z14.h\n"
+ "fmla z30.h, p3/M, z6.h, z16.h\n"
+ "fmla z24.h, p3/M, z0.h, z16.h\n"
"fmla z26.h, p3/M, z8.h, z18.h\n"
- "fmla z24.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z6.h, z16.h\n"
- "fmla z28.h, p3/M, z3.h, z19.h\n"
- "fmla z25.h, p3/M, z0.h, z19.h\n"
- "fmla z22.h, p3/M, z5.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z4.h, z19.h\n"
- "fmla z26.h, p3/M, z1.h, z19.h\n"
- "fmla z28.h, p3/M, z5.h, z17.h\n"
- "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z17.h\n"
- "fmla z25.h, p3/M, z2.h, z17.h\n"
- "fmla z24.h, p3/M, z1.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z18.h\n"
- "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z18.h\n"
- "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z2.h, z17.h\n"
- "fmla z26.h, p3/M, z7.h, z16.h\n"
- "fmla z25.h, p3/M, z6.h, z16.h\n"
- "fmla z23.h, p3/M, z4.h, z16.h\n"
- "fmla z21.h, p3/M, z3.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z18.h\n"
- "fmla z28.h, p3/M, z1.h, z17.h\n"
- "fmax z28.h, p3/M, z28.h, z31.h\n"
- "fmin z28.h, p3/M, z28.h, z30.h\n"
- "fmla z27.h, p3/M, z0.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
- "fmla z29.h, p3/M, z6.h, z16.h\n"
- "fmax z29.h, p3/M, z29.h, z31.h\n"
- "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z27.h, p3/M, z3.h, z16.h\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
"fmla z21.h, p3/M, z5.h, z18.h\n"
- "fmin z29.h, p3/M, z29.h, z30.h\n"
- "st1h { z29.h }, p0, [x10, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z0.h, z16.h\n"
- "fmla z22.h, p3/M, z2.h, z17.h\n"
- "ldr x20, [x13, #0x20]\n"
- "st1h { z28.h }, p0, [x9, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z8.h, z18.h\n"
- "fmla z26.h, p3/M, z3.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z31.h\n"
- "fmla z27.h, p3/M, z8.h, z17.h\n"
- "fmla z24.h, p3/M, z5.h, z17.h\n"
- "fmax z27.h, p3/M, z27.h, z31.h\n"
- "fmax z25.h, p3/M, z25.h, z31.h\n"
- "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z31.h\n"
+ "fmla z23.h, p3/M, z2.h, z17.h\n"
+ "fmax z30.h, p3/M, z30.h, z14.h\n"
+ "fmax z26.h, p3/M, z26.h, z14.h\n"
+ "fmla z24.h, p3/M, z8.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z14.h\n"
+ "fmax z27.h, p3/M, z27.h, z14.h\n"
+ "st1h { z29.h }, p0, [x10, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x28]\n"
"fmla z21.h, p3/M, z7.h, z16.h\n"
- "fmin z27.h, p3/M, z27.h, z30.h\n"
- "fmin z26.h, p3/M, z26.h, z30.h\n"
- "fmla z22.h, p3/M, z6.h, z16.h\n"
- "fmin z25.h, p3/M, z25.h, z30.h\n"
- "fmax z24.h, p3/M, z24.h, z31.h\n"
- "st1h { z27.h }, p0, [x28, x14, LSL #1]\n"
- "fmax z23.h, p3/M, z23.h, z31.h\n"
- "fmax z21.h, p3/M, z21.h, z31.h\n"
- "st1h { z26.h }, p0, [x27, x14, LSL #1]\n"
- "ldr x23, [x13, #0x28]\n"
- "fmax z22.h, p3/M, z22.h, z31.h\n"
- "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
- "ldr x22, [x13, #0x30]\n"
- "ldr x21, [x13, #0x38]\n"
- "ldr x20, [x13, #0x40]\n"
- "fmin z24.h, p3/M, z24.h, z30.h\n"
- "fmin z23.h, p3/M, z23.h, z30.h\n"
- "st1h { z24.h }, p0, [x23, x14, LSL #1]\n"
- "fmin z21.h, p3/M, z21.h, z30.h\n"
- "fmin z22.h, p3/M, z22.h, z30.h\n"
- "st1h { z23.h }, p0, [x22, x14, LSL #1]\n"
- "st1h { z21.h }, p0, [x21, x14, LSL #1]\n"
- "st1h { z22.h }, p0, [x20, x14, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z31.h\n"
+ "fmin z26.h, p3/M, z26.h, z31.h\n"
+ "fmax z25.h, p3/M, z25.h, z14.h\n"
+ "fmla z23.h, p3/M, z6.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z31.h\n"
+ "fmin z27.h, p3/M, z27.h, z31.h\n"
+ "fmax z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z30.h }, p0, [x11, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "fmax z21.h, p3/M, z21.h, z14.h\n"
+ "st1h { z28.h }, p0, [x28, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x30]\n"
+ "fmin z25.h, p3/M, z25.h, z31.h\n"
+ "fmax z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z27.h }, p0, [x27, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x38]\n"
+ "st1h { z26.h }, p0, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z31.h\n"
+ "fmin z21.h, p3/M, z21.h, z31.h\n"
+ "st1h { z25.h }, p0, [x23, x13, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z31.h\n"
+ "st1h { z24.h }, p0, [x22, x13, LSL #1]\n"
+ "st1h { z21.h }, p0, [x21, x13, LSL #1]\n"
+ "st1h { z23.h }, p0, [x20, x13, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 41eaa4f18c..d71286f6c5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,565 +88,565 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x16, #0x0\n"
- "mov x4, #0x0\n"
+ "mov x1, #0x0\n"
+ "mov x2, #0x0\n"
"1:" // Tile loop
- "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x20, #0x4\n"
"mov x25, #0x4\n"
- "mov x24, #0x4\n"
- "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
- "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
- "add x7, x5, x5\n"
- "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "cnth x16\n"
- "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
- "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x14, x7, x5\n"
+ "str x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "cnth x3\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x13, x8, x23, LSL #1\n"
- "ld1h { z19.h }, p3/Z, [x17]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "add x12, x13, x23, LSL #1\n"
- "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "add x11, x12, x23, LSL #1\n"
- "add x10, x14, x5\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "add x9, x15, x22, LSL #1\n"
- "add x28, x11, x23, LSL #1\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "add x27, x10, x5\n"
- "add x26, x9, x22, LSL #1\n"
- "add x25, x6, x6\n"
- "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cmp x16, %x[n_channels]\n"
- "add x24, x28, x23, LSL #1\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "add x23, x26, x22, LSL #1\n"
- "add x22, x25, x6\n"
- "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x8]\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x16\n"
- "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
- "addvl x17, x17, #-6\n"
+ "mov x6, #0x0\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x22, x1, x24\n" // offset = tile_i * ld_input_row
+ "mul x21, x1, x23\n" // offset = tile_i * ld_output_row
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cmp x3, %x[n_channels]\n"
+ "ld1rh { z27.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x16, x4, x4\n"
+ "add x15, x5, x5\n"
+ "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "madd x22, x2, x4, x22\n" // offset += tile_j * ld_input_col
+ "add x14, x16, x4\n"
+ "ld1h { z13.h }, p3/Z, [x8]\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "add x13, x15, x5\n"
+ "madd x21, x2, x5, x21\n" // offset += tile_j * ld_output_col
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "add x12, x14, x4\n"
+ "mul x22, x22, x20\n" // offset *= kernel_stride * output_size
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "add x11, x12, x4\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "sub x20, XZR, x3\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "add x7, x7, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x10, x7, x24, LSL #1\n"
+ "add x9, x10, x24, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x7]\n"
+ "ld1h { z11.h }, p2/Z, [x7, x11, LSL #1]\n"
+ "add x28, x9, x24, LSL #1\n"
+ "add x27, x28, x24, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "add x17, x17, x21, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x26, x27, x24, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x9, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x9, x14, LSL #1]\n"
+ "add x25, x17, x23, LSL #1\n"
+ "add x24, x25, x23, LSL #1\n"
+ "add x23, x24, x23, LSL #1\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "whilelt p1.h, x16, %x[n_channels]\n"
- "inch x21\n"
- "movprfx z21, z19\n fmla z21.h, p3/M, z3.h, z9.h\n"
- "movprfx z22, z19\n fmla z22.h, p3/M, z1.h, z9.h\n"
- "inch x16\n"
+ "movprfx z14, z13\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z19, z13\n fmla z19.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x3, %x[n_channels]\n"
+ "inch x6\n"
+ "movprfx z18, z13\n fmla z18.h, p3/M, z3.h, z9.h\n"
+ "movprfx z26, z13\n fmla z26.h, p3/M, z1.h, z9.h\n"
+ "inch x3\n"
"mov p0.b, p2.b\n"
- "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "movprfx z15, z13\n fmla z15.h, p3/M, z0.h, z9.h\n"
+ "movprfx z30, z13\n fmla z30.h, p3/M, z7.h, z9.h\n"
"inch x20\n"
- "movprfx z13, z19\n fmla z13.h, p3/M, z7.h, z9.h\n"
- "movprfx z17, z19\n fmla z17.h, p3/M, z6.h, z9.h\n"
- "movprfx z27, z19\n fmla z27.h, p3/M, z5.h, z9.h\n"
- "movprfx z18, z19\n fmla z18.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z11.h\n"
- "ld1h { z29.h }, p2/Z, [x24]\n"
- "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmla z21.h, p3/M, z4.h, z12.h\n"
- "fmla z22.h, p3/M, z2.h, z12.h\n"
- "fmla z20.h, p3/M, z1.h, z12.h\n"
- "movprfx z23, z19\n fmla z23.h, p3/M, z6.h, z29.h\n"
- "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "movprfx z28, z13\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "movprfx z21, z13\n fmla z21.h, p3/M, z5.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "movprfx z24, z13\n fmla z24.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z19.h, p3/M, z0.h, z10.h\n"
+ "movprfx z22, z13\n fmla z22.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x11, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z15.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "movprfx z25, z13\n fmla z25.h, p3/M, z6.h, z11.h\n"
"fmla z14.h, p3/M, z7.h, z9.h\n"
- "fmla z13.h, p3/M, z8.h, z12.h\n"
- "fmla z17.h, p3/M, z7.h, z12.h\n"
- "fmla z30.h, p3/M, z6.h, z12.h\n"
- "movprfx z26, z19\n fmla z26.h, p3/M, z3.h, z12.h\n"
- "movprfx z28, z19\n fmla z28.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
- "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z11.h\n"
- "fmla z21.h, p3/M, z6.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z22.h, p3/M, z4.h, z9.h\n"
- "fmla z20.h, p3/M, z3.h, z9.h\n"
- "movprfx z25, z19\n fmla z25.h, p3/M, z1.h, z9.h\n"
- "movprfx z29, z19\n fmla z29.h, p3/M, z0.h, z9.h\n"
- "ld1h { z19.h }, p3/Z, [x17]\n"
- "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z22.h, p3/M, z6.h, z12.h\n"
+ "movprfx z31, z13\n fmla z31.h, p3/M, z3.h, z12.h\n"
+ "movprfx z17, z13\n fmla z17.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x4, LSL #1]\n"
+ "movprfx z20, z13\n fmla z20.h, p3/M, z8.h, z10.h\n"
+ "fmla z18.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x7, x12, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z9.h\n"
+ "fmla z15.h, p3/M, z3.h, z9.h\n"
+ "movprfx z16, z13\n fmla z16.h, p3/M, z1.h, z9.h\n"
+ "movprfx z23, z13\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z13.h }, p3/Z, [x8]\n"
+ "fmla z21.h, p3/M, z8.h, z9.h\n"
+ "fmla z24.h, p3/M, z5.h, z9.h\n"
+ "fmla z25.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x10]\n"
+ "fmla z19.h, p3/M, z1.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z10.h\n"
+ "fmla z22.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27]\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z15.h, p3/M, z4.h, z11.h\n"
+ "fmla z17.h, p3/M, z3.h, z11.h\n"
+ "fmla z16.h, p3/M, z2.h, z11.h\n"
+ "fmla z23.h, p3/M, z1.h, z11.h\n"
+ "fmla z20.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x10, x16, LSL #1]\n"
+ "fmla z21.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x11, LSL #1]\n"
+ "fmla z19.h, p3/M, z3.h, z9.h\n"
+ "fmla z14.h, p3/M, z1.h, z11.h\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z4.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "fmla z18.h, p3/M, z0.h, z11.h\n"
+ "fmla z17.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x4, LSL #1]\n"
+ "fmla z21.h, p3/M, z2.h, z11.h\n"
+ "fmla z14.h, p3/M, z2.h, z12.h\n"
+ "fmla z19.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z12.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "fmla z31.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x9, x12, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z16.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z11.h\n"
+ "fmla z14.h, p3/M, z3.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z19.h, p3/M, z7.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "fmla z20.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z9.h\n"
+ "fmla z22.h, p3/M, z7.h, z9.h\n"
"fmla z18.h, p3/M, z5.h, z9.h\n"
- "fmla z23.h, p3/M, z2.h, z9.h\n"
- "fmla z14.h, p3/M, z8.h, z10.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z13.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z11.h\n"
- "fmla z30.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28]\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "fmla z15.h, p3/M, z2.h, z9.h\n"
+ "fmla z17.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x14, LSL #1]\n"
+ "addvl x7, x7, #1\n"
"fmla z21.h, p3/M, z7.h, z10.h\n"
- "fmla z26.h, p3/M, z6.h, z10.h\n"
- "fmla z22.h, p3/M, z5.h, z10.h\n"
- "fmla z20.h, p3/M, z4.h, z10.h\n"
- "fmla z28.h, p3/M, z3.h, z10.h\n"
- "fmla z25.h, p3/M, z2.h, z10.h\n"
- "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z14.h, p3/M, z6.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z10.h\n"
+ "fmla z26.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z1.h, z10.h\n"
+ "fmla z16.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z19.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x9]\n"
+ "fmla z22.h, p3/M, z0.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmla z15.h, p3/M, z5.h, z12.h\n"
+ "fmla z21.h, p3/M, z3.h, z10.h\n"
"fmla z24.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z27.h, p3/M, z0.h, z9.h\n"
- "fmla z18.h, p3/M, z6.h, z11.h\n"
- "fmla z23.h, p3/M, z3.h, z11.h\n"
- "fmla z14.h, p3/M, z1.h, z10.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "fmla z30.h, p3/M, z5.h, z12.h\n"
- "fmla z26.h, p3/M, z2.h, z12.h\n"
- "fmla z13.h, p3/M, z4.h, z10.h\n"
- "ld1h { z9.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z17.h, p3/M, z3.h, z10.h\n"
- "fmla z21.h, p3/M, z0.h, z10.h\n"
- "fmla z28.h, p3/M, z8.h, z11.h\n"
- "fmla z24.h, p3/M, z5.h, z11.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
- "fmla z27.h, p3/M, z2.h, z10.h\n"
- "fmla z14.h, p3/M, z2.h, z9.h\n"
- "fmla z31.h, p3/M, z5.h, z10.h\n"
- "fmla z13.h, p3/M, z5.h, z9.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x5, LSL #1]\n"
- "fmla z17.h, p3/M, z4.h, z9.h\n"
- "fmla z30.h, p3/M, z3.h, z9.h\n"
- "fmla z21.h, p3/M, z1.h, z9.h\n"
- "fmla z26.h, p3/M, z0.h, z9.h\n"
- "ld1h { z10.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z12.h\n"
- "fmla z25.h, p3/M, z6.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x10, LSL #1]\n"
- "fmla z27.h, p3/M, z4.h, z11.h\n"
- "fmla z14.h, p3/M, z3.h, z11.h\n"
- "fmla z18.h, p3/M, z1.h, z11.h\n"
- "fmla z22.h, p3/M, z0.h, z11.h\n"
- "fmla z31.h, p3/M, z7.h, z11.h\n"
- "fmla z13.h, p3/M, z6.h, z11.h\n"
- "ld1h { z9.h }, p2/Z, [x8, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z8.h, z12.h\n"
- "fmla z24.h, p3/M, z7.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
- "fmla z17.h, p3/M, z8.h, z10.h\n"
- "fmla z30.h, p3/M, z7.h, z10.h\n"
- "fmla z21.h, p3/M, z5.h, z10.h\n"
- "fmla z26.h, p3/M, z4.h, z10.h\n"
- "fmla z20.h, p3/M, z2.h, z10.h\n"
- "fmla z28.h, p3/M, z1.h, z10.h\n"
- "ld1h { z11.h }, p2/Z, [x8, x14, LSL #1]\n"
- "addvl x8, x8, #1\n"
- "fmla z27.h, p3/M, z7.h, z12.h\n"
- "fmla z14.h, p3/M, z6.h, z12.h\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmla z22.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z1.h, z12.h\n"
- "fmla z25.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z9.h\n"
- "fmla z13.h, p3/M, z1.h, z9.h\n"
- "fmla z17.h, p3/M, z0.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x12]\n"
- "fmla z29.h, p3/M, z2.h, z12.h\n"
- "fmla z30.h, p3/M, z0.h, z11.h\n"
- "fmla z27.h, p3/M, z3.h, z9.h\n"
- "fmla z18.h, p3/M, z0.h, z9.h\n"
- "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x11, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z19.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28]\n"
+ "fmla z16.h, p3/M, z4.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z12.h\n"
"fmla z26.h, p3/M, z7.h, z12.h\n"
- "fmla z20.h, p3/M, z5.h, z12.h\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
- "fmla z24.h, p3/M, z1.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x7, LSL #1]\n"
- "fmla z13.h, p3/M, z2.h, z11.h\n"
- "fmla z17.h, p3/M, z1.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
- "addvl x12, x12, #1\n"
- "fmla z31.h, p3/M, z6.h, z9.h\n"
- "ld1h { z12.h }, p2/Z, [x11]\n"
- "fmla z25.h, p3/M, z4.h, z10.h\n"
- "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
- "fmla z29.h, p3/M, z3.h, z10.h\n"
- "fmla z30.h, p3/M, z8.h, z11.h\n"
- "fmla z26.h, p3/M, z5.h, z11.h\n"
- "fmla z28.h, p3/M, z2.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
- "addvl x11, x11, #1\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "fmla z18.h, p3/M, z3.h, z12.h\n"
- "fmla z23.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x7, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z11.h\n"
- "fmla z25.h, p3/M, z7.h, z12.h\n"
- "fmla z29.h, p3/M, z6.h, z12.h\n"
- "fmla z18.h, p3/M, z8.h, z10.h\n"
- "fmla z22.h, p3/M, z7.h, z10.h\n"
- "fmla z20.h, p3/M, z6.h, z10.h\n"
- "fmla z23.h, p3/M, z5.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z5.h, z11.h\n"
- "fmla z25.h, p3/M, z5.h, z10.h\n"
- "fmla z29.h, p3/M, z4.h, z10.h\n"
- "fmla z24.h, p3/M, z3.h, z10.h\n"
- "fmla z26.h, p3/M, z8.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x13, x5, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z10.h\n"
- "fmla z20.h, p3/M, z7.h, z10.h\n"
- "addvl x24, x24, #1\n"
- "fmla z28.h, p3/M, z6.h, z10.h\n"
- "fmla z25.h, p3/M, z8.h, z11.h\n"
- "ld1h { z10.h }, p2/Z, [x13, x10, LSL #1]\n"
- "addvl x13, x13, #1\n"
- "fmla z29.h, p3/M, z7.h, z11.h\n"
- "fmla z24.h, p3/M, z6.h, z11.h\n"
- "ld1h { z11.h }, p2/Z, [x28, x5, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z12.h\n"
- "fmla z13.h, p3/M, z3.h, z12.h\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
- "fmax z13.h, p3/M, z13.h, z15.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "fmla z14.h, p3/M, z0.h, z12.h\n"
- "ld1h { z0.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
- "fmla z17.h, p3/M, z5.h, z10.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "fmax z17.h, p3/M, z17.h, z15.h\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "fmla z21.h, p3/M, z2.h, z10.h\n"
- "fmla z26.h, p3/M, z1.h, z10.h\n"
- "fmax z14.h, p3/M, z14.h, z15.h\n"
- "fmax z21.h, p3/M, z21.h, z15.h\n"
- "fmla z18.h, p3/M, z7.h, z11.h\n"
- "fmla z22.h, p3/M, z6.h, z11.h\n"
- "fmax z26.h, p3/M, z26.h, z15.h\n"
- "fmax z18.h, p3/M, z18.h, z15.h\n"
- "fmla z23.h, p3/M, z4.h, z11.h\n"
- "fmla z25.h, p3/M, z3.h, z11.h\n"
- "fmax z22.h, p3/M, z22.h, z15.h\n"
- "fmax z23.h, p3/M, z23.h, z15.h\n"
- "fmla z20.h, p3/M, z8.h, z0.h\n"
- "fmla z28.h, p3/M, z7.h, z0.h\n"
- "fmax z20.h, p3/M, z20.h, z15.h\n"
- "fmax z28.h, p3/M, z28.h, z15.h\n"
- "fmla z29.h, p3/M, z5.h, z0.h\n"
- "fmla z24.h, p3/M, z4.h, z0.h\n"
- "fmax z25.h, p3/M, z25.h, z15.h\n"
- "fmax z29.h, p3/M, z29.h, z15.h\n"
- "fmax z24.h, p3/M, z24.h, z15.h\n"
- "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
- "fmin z13.h, p3/M, z13.h, z16.h\n"
- "fmin z17.h, p3/M, z17.h, z16.h\n"
- "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "ld1h { z10.h }, p1/Z, [x8]\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmin z14.h, p3/M, z14.h, z16.h\n"
- "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
- "ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
- "fmin z21.h, p3/M, z21.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "st1h { z31.h }, p0, [x15]\n"
- "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
- "fmin z18.h, p3/M, z18.h, z16.h\n"
- "fmin z22.h, p3/M, z22.h, z16.h\n"
- "st1h { z13.h }, p0, [x15, x6, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
- "fmin z20.h, p3/M, z20.h, z16.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z17.h }, p0, [x15, x25, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "st1h { z30.h }, p0, [x15, x22, LSL #1]\n"
- "fmin z29.h, p3/M, z29.h, z16.h\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "st1h { z27.h }, p0, [x9]\n"
+ "fmla z22.h, p3/M, z8.h, z11.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z9.h }, p1/Z, [x9, x16, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x11, LSL #1]\n"
"addvl x28, x28, #1\n"
- "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
- "addvl x15, x15, #1\n"
- "st1h { z21.h }, p0, [x9, x25, LSL #1]\n"
- "addvl x17, x17, #-6\n"
- "st1h { z26.h }, p0, [x9, x22, LSL #1]\n"
- "addvl x9, x9, #1\n"
- "st1h { z18.h }, p0, [x26]\n"
- "st1h { z22.h }, p0, [x26, x6, LSL #1]\n"
- "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
- "st1h { z28.h }, p0, [x26, x22, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z6.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "fmla z16.h, p3/M, z7.h, z10.h\n"
+ "fmla z23.h, p3/M, z6.h, z10.h\n"
+ "fmla z17.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x14, LSL #1]\n"
"addvl x26, x26, #1\n"
- "st1h { z23.h }, p0, [x23]\n"
- "st1h { z25.h }, p0, [x23, x6, LSL #1]\n"
- "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z12.h\n"
+ "fmla z25.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z16.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z3.h, z12.h\n"
+ "fmla z26.h, p3/M, z8.h, z12.h\n"
+ "fmla z15.h, p3/M, z7.h, z12.h\n"
+ "fmla z17.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x12, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x10, x4, LSL #1]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z16.h, p3/M, z8.h, z11.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z20.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z22.h, p3/M, z4.h, z12.h\n"
+ "fmla z19.h, p3/M, z4.h, z10.h\n"
+ "fmla z30.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z14.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x27, x12, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z18.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "fmla z26.h, p3/M, z6.h, z11.h\n"
+ "fmax z28.h, p3/M, z28.h, z27.h\n"
+ "fmax z22.h, p3/M, z22.h, z27.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z16.h, p3/M, z3.h, z11.h\n"
+ "fmax z19.h, p3/M, z19.h, z27.h\n"
+ "fmax z30.h, p3/M, z30.h, z27.h\n"
+ "fmla z15.h, p3/M, z8.h, z10.h\n"
+ "fmla z17.h, p3/M, z7.h, z10.h\n"
+ "fmax z21.h, p3/M, z21.h, z27.h\n"
+ "fmax z14.h, p3/M, z14.h, z27.h\n"
+ "fmla z23.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmax z18.h, p3/M, z18.h, z27.h\n"
+ "fmax z31.h, p3/M, z31.h, z27.h\n"
+ "fmax z24.h, p3/M, z24.h, z27.h\n"
+ "fmax z26.h, p3/M, z26.h, z27.h\n"
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmax z25.h, p3/M, z25.h, z27.h\n"
+ "fmax z16.h, p3/M, z16.h, z27.h\n"
+ "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+ "fmax z15.h, p3/M, z15.h, z27.h\n"
+ "fmax z17.h, p3/M, z17.h, z27.h\n"
+ "ld1h { z10.h }, p1/Z, [x7]\n"
+ "ld1h { z11.h }, p1/Z, [x7, x11, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z27.h\n"
+ "fmax z20.h, p3/M, z20.h, z27.h\n"
+ "ld1h { z12.h }, p1/Z, [x9, x14, LSL #1]\n"
+ "addvl x8, x8, #16\n"
+ "whilelt p2.h, x6, %x[n_channels]\n"
+ "cmp x3, %x[n_channels]\n"
+ "fmin z19.h, p3/M, z19.h, z29.h\n"
+ "fmin z30.h, p3/M, z30.h, z29.h\n"
+ "fmin z28.h, p3/M, z28.h, z29.h\n"
+ "fmin z22.h, p3/M, z22.h, z29.h\n"
+ "fmin z21.h, p3/M, z21.h, z29.h\n"
+ "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+ "fmin z14.h, p3/M, z14.h, z29.h\n"
+ "fmin z18.h, p3/M, z18.h, z29.h\n"
+ "st1h { z19.h }, p0, [x17]\n"
+ "fmin z31.h, p3/M, z31.h, z29.h\n"
+ "fmin z24.h, p3/M, z24.h, z29.h\n"
+ "st1h { z30.h }, p0, [x17, x5, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z29.h\n"
+ "fmin z15.h, p3/M, z15.h, z29.h\n"
+ "st1h { z28.h }, p0, [x17, x15, LSL #1]\n"
+ "fmin z17.h, p3/M, z17.h, z29.h\n"
+ "fmin z25.h, p3/M, z25.h, z29.h\n"
+ "st1h { z22.h }, p0, [x17, x13, LSL #1]\n"
+ "fmin z16.h, p3/M, z16.h, z29.h\n"
+ "fmin z23.h, p3/M, z23.h, z29.h\n"
+ "st1h { z21.h }, p0, [x25]\n"
+ "fmin z20.h, p3/M, z20.h, z29.h\n"
+ "addvl x27, x27, #1\n"
+ "st1h { z14.h }, p0, [x25, x5, LSL #1]\n"
+ "st1h { z18.h }, p0, [x25, x15, LSL #1]\n"
+ "addvl x17, x17, #1\n"
+ "addvl x8, x8, #-6\n"
+ "st1h { z31.h }, p0, [x25, x13, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z24.h }, p0, [x24]\n"
+ "st1h { z26.h }, p0, [x24, x5, LSL #1]\n"
+ "st1h { z15.h }, p0, [x24, x15, LSL #1]\n"
+ "st1h { z17.h }, p0, [x24, x13, LSL #1]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z25.h }, p0, [x23]\n"
+ "st1h { z16.h }, p0, [x23, x5, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x15, LSL #1]\n"
+ "st1h { z20.h }, p0, [x23, x13, LSL #1]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
- "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
- "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z30, z19\n fmla z30.h, p3/M, z3.h, z9.h\n"
- "movprfx z13, z19\n fmla z13.h, p3/M, z1.h, z9.h\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x4, x4, #0x1\n"
- "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
- "fmla z14.h, p3/M, z5.h, z12.h\n"
- "cmp x4, x20\n"
- "add x21, x16, #0x1\n"
- "movprfx z18, z19\n fmla z18.h, p3/M, z7.h, z9.h\n"
- "movprfx z28, z19\n fmla z28.h, p3/M, z6.h, z9.h\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x16, x16, x21, LT\n"
- "movprfx z17, z19\n fmla z17.h, p3/M, z5.h, z9.h\n"
- "movprfx z26, z19\n fmla z26.h, p3/M, z2.h, z9.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "movprfx z14, z13\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z13\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z23, z13\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z13\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "movprfx z20, z13\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z25, z13\n fmla z25.h, p3/M, z7.h, z9.h\n"
"mov p0.b, p2.b\n"
- "fmla z31.h, p3/M, z0.h, z10.h\n"
- "movprfx z27, z19\n fmla z27.h, p3/M, z2.h, z11.h\n"
- "ld1h { z29.h }, p2/Z, [x24]\n"
- "ld1h { z21.h }, p2/Z, [x24, x27, LSL #1]\n"
- "fmla z30.h, p3/M, z4.h, z12.h\n"
- "fmla z13.h, p3/M, z2.h, z12.h\n"
- "csel x4, x4, XZR, LT\n"
- "cmp x16, x20\n"
+ "movprfx z19, z13\n fmla z19.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z13\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "add x2, x2, #0x1\n"
+ "add x20, x1, #0x1\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "movprfx z28, z13\n fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z15.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "cmp x2, x22\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z13\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "ld1h { z24.h }, p2/Z, [x26, x11, LSL #1]\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "csel x1, x1, x20, LT\n"
+ "csel x2, x2, XZR, LT\n"
"fmla z20.h, p3/M, z1.h, z12.h\n"
- "movprfx z10, z19\n fmla z10.h, p3/M, z6.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x11, x14, LSL #1]\n"
- "fmla z14.h, p3/M, z7.h, z9.h\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmla z28.h, p3/M, z7.h, z12.h\n"
- "fmla z27.h, p3/M, z6.h, z12.h\n"
- "movprfx z11, z19\n fmla z11.h, p3/M, z3.h, z12.h\n"
- "movprfx z25, z19\n fmla z25.h, p3/M, z0.h, z12.h\n"
- "ld1h { z22.h }, p2/Z, [x8, x5, LSL #1]\n"
- "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z21.h\n"
- "fmla z30.h, p3/M, z6.h, z9.h\n"
- "ld1h { z21.h }, p2/Z, [x8, x10, LSL #1]\n"
- "fmla z13.h, p3/M, z4.h, z9.h\n"
- "fmla z20.h, p3/M, z3.h, z9.h\n"
- "movprfx z12, z19\n fmla z12.h, p3/M, z1.h, z9.h\n"
- "movprfx z23, z19\n fmla z23.h, p3/M, z0.h, z9.h\n"
- "fmla z17.h, p3/M, z8.h, z9.h\n"
- "fmla z26.h, p3/M, z5.h, z9.h\n"
- "fmla z10.h, p3/M, z2.h, z9.h\n"
- "fmla z14.h, p3/M, z8.h, z29.h\n"
- "ld1h { z9.h }, p2/Z, [x13]\n"
- "fmla z31.h, p3/M, z1.h, z22.h\n"
- "fmla z18.h, p3/M, z0.h, z22.h\n"
- "ld1h { z22.h }, p2/Z, [x13, x27, LSL #1]\n"
- "fmla z28.h, p3/M, z2.h, z21.h\n"
- "fmla z27.h, p3/M, z1.h, z21.h\n"
- "ld1h { z19.h }, p2/Z, [x28]\n"
- "fmla z30.h, p3/M, z7.h, z29.h\n"
- "fmla z11.h, p3/M, z6.h, z29.h\n"
- "fmla z13.h, p3/M, z5.h, z29.h\n"
- "fmla z20.h, p3/M, z4.h, z29.h\n"
- "fmla z25.h, p3/M, z3.h, z29.h\n"
- "fmla z12.h, p3/M, z2.h, z29.h\n"
- "fmla z23.h, p3/M, z1.h, z29.h\n"
- "fmla z24.h, p3/M, z0.h, z29.h\n"
- "ld1h { z21.h }, p2/Z, [x13, x7, LSL #1]\n"
- "fmla z17.h, p3/M, z0.h, z9.h\n"
- "fmla z26.h, p3/M, z6.h, z19.h\n"
- "fmla z10.h, p3/M, z3.h, z19.h\n"
- "fmla z14.h, p3/M, z1.h, z21.h\n"
- "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z9.h\n"
- "fmla z27.h, p3/M, z5.h, z22.h\n"
- "fmla z11.h, p3/M, z2.h, z22.h\n"
- "fmla z18.h, p3/M, z4.h, z21.h\n"
- "ld1h { z29.h }, p2/Z, [x13, x14, LSL #1]\n"
- "fmla z28.h, p3/M, z3.h, z21.h\n"
- "fmla z30.h, p3/M, z0.h, z21.h\n"
- "fmla z25.h, p3/M, z8.h, z19.h\n"
- "fmla z24.h, p3/M, z5.h, z19.h\n"
- "ld1h { z19.h }, p2/Z, [x24, x5, LSL #1]\n"
- "fmla z17.h, p3/M, z2.h, z21.h\n"
- "fmla z14.h, p3/M, z2.h, z29.h\n"
- "fmla z31.h, p3/M, z5.h, z21.h\n"
- "fmla z18.h, p3/M, z5.h, z29.h\n"
- "ld1h { z22.h }, p2/Z, [x12, x5, LSL #1]\n"
- "fmla z28.h, p3/M, z4.h, z29.h\n"
- "fmla z27.h, p3/M, z3.h, z29.h\n"
- "fmla z30.h, p3/M, z1.h, z29.h\n"
- "fmla z11.h, p3/M, z0.h, z29.h\n"
- "ld1h { z21.h }, p2/Z, [x12, x10, LSL #1]\n"
- "fmla z10.h, p3/M, z7.h, z19.h\n"
- "fmla z12.h, p3/M, z6.h, z19.h\n"
- "ld1h { z19.h }, p2/Z, [x24, x10, LSL #1]\n"
- "fmla z17.h, p3/M, z4.h, z22.h\n"
- "fmla z14.h, p3/M, z3.h, z22.h\n"
- "fmla z26.h, p3/M, z1.h, z22.h\n"
- "fmla z13.h, p3/M, z0.h, z22.h\n"
- "fmla z31.h, p3/M, z7.h, z22.h\n"
- "fmla z18.h, p3/M, z6.h, z22.h\n"
- "ld1h { z29.h }, p2/Z, [x8, x7, LSL #1]\n"
- "fmla z23.h, p3/M, z8.h, z19.h\n"
- "fmla z24.h, p3/M, z7.h, z19.h\n"
- "ld1h { z19.h }, p2/Z, [x11, x5, LSL #1]\n"
- "fmla z28.h, p3/M, z8.h, z21.h\n"
- "fmla z27.h, p3/M, z7.h, z21.h\n"
- "fmla z30.h, p3/M, z5.h, z21.h\n"
- "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z12.h\n"
+ "movprfx z22, z13\n fmla z22.h, p3/M, z6.h, z17.h\n"
+ "fmla z14.h, p3/M, z7.h, z15.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z19.h, p3/M, z7.h, z12.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "cmp x1, x21\n"
+ "movprfx z31, z13\n fmla z31.h, p3/M, z3.h, z12.h\n"
+ "movprfx z11, z13\n fmla z11.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x7, x4, LSL #1]\n"
+ "movprfx z12, z13\n fmla z12.h, p3/M, z8.h, z24.h\n"
+ "fmla z23.h, p3/M, z6.h, z15.h\n"
+ "ld1h { z17.h }, p2/Z, [x7, x12, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z15.h\n"
+ "fmla z20.h, p3/M, z3.h, z15.h\n"
+ "movprfx z24, z13\n fmla z24.h, p3/M, z1.h, z15.h\n"
+ "fmla z13.h, p3/M, z0.h, z15.h\n"
+ "fmla z26.h, p3/M, z8.h, z15.h\n"
+ "fmla z28.h, p3/M, z5.h, z15.h\n"
+ "fmla z22.h, p3/M, z2.h, z15.h\n"
+ "fmla z14.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z15.h }, p2/Z, [x10]\n"
+ "fmla z18.h, p3/M, z1.h, z16.h\n"
+ "fmla z25.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z21.h }, p2/Z, [x10, x11, LSL #1]\n"
+ "fmla z19.h, p3/M, z2.h, z17.h\n"
+ "fmla z9.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "fmla z23.h, p3/M, z7.h, z10.h\n"
+ "fmla z31.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z11.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "fmla z12.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z17.h }, p2/Z, [x10, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z15.h\n"
+ "fmla z28.h, p3/M, z6.h, z16.h\n"
+ "fmla z22.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x11, LSL #1]\n"
+ "fmla z18.h, p3/M, z3.h, z15.h\n"
+ "fmla z14.h, p3/M, z1.h, z17.h\n"
+ "fmla z9.h, p3/M, z5.h, z21.h\n"
+ "fmla z31.h, p3/M, z2.h, z21.h\n"
+ "fmla z25.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x10, x14, LSL #1]\n"
+ "fmla z19.h, p3/M, z3.h, z17.h\n"
+ "fmla z23.h, p3/M, z0.h, z17.h\n"
+ "fmla z11.h, p3/M, z8.h, z16.h\n"
+ "fmla z12.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z17.h\n"
+ "fmla z14.h, p3/M, z2.h, z21.h\n"
+ "fmla z18.h, p3/M, z5.h, z17.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, x4, LSL #1]\n"
+ "fmla z19.h, p3/M, z4.h, z21.h\n"
+ "fmla z9.h, p3/M, z3.h, z21.h\n"
+ "fmla z23.h, p3/M, z1.h, z21.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z16.h\n"
+ "fmla z24.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "fmla z14.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z1.h, z17.h\n"
+ "fmla z30.h, p3/M, z0.h, z17.h\n"
+ "fmla z18.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x7, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z8.h, z16.h\n"
+ "fmla z12.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x4, LSL #1]\n"
+ "fmla z19.h, p3/M, z8.h, z21.h\n"
+ "fmla z9.h, p3/M, z7.h, z21.h\n"
+ "fmla z23.h, p3/M, z5.h, z21.h\n"
+ "fmla z31.h, p3/M, z4.h, z21.h\n"
"fmla z20.h, p3/M, z2.h, z21.h\n"
- "fmla z25.h, p3/M, z1.h, z21.h\n"
- "ld1h { z22.h }, p2/Z, [x8, x14, LSL #1]\n"
- "fmla z17.h, p3/M, z7.h, z19.h\n"
- "fmla z14.h, p3/M, z6.h, z19.h\n"
- "fmla z26.h, p3/M, z4.h, z19.h\n"
- "fmla z13.h, p3/M, z3.h, z19.h\n"
- "fmla z10.h, p3/M, z1.h, z19.h\n"
- "fmla z12.h, p3/M, z0.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x11, x10, LSL #1]\n"
- "fmla z31.h, p3/M, z2.h, z29.h\n"
- "fmla z18.h, p3/M, z1.h, z29.h\n"
- "fmla z28.h, p3/M, z0.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x12]\n"
- "fmla z23.h, p3/M, z2.h, z21.h\n"
- "fmla z27.h, p3/M, z0.h, z22.h\n"
- "fmla z17.h, p3/M, z3.h, z29.h\n"
- "fmla z26.h, p3/M, z0.h, z29.h\n"
- "fmla z30.h, p3/M, z8.h, z21.h\n"
- "fmla z11.h, p3/M, z7.h, z21.h\n"
- "fmla z20.h, p3/M, z5.h, z21.h\n"
- "fmla z25.h, p3/M, z4.h, z21.h\n"
- "fmla z24.h, p3/M, z1.h, z21.h\n"
- "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
- "fmla z18.h, p3/M, z2.h, z22.h\n"
- "fmla z28.h, p3/M, z1.h, z22.h\n"
- "ld1h { z21.h }, p2/Z, [x12, x27, LSL #1]\n"
- "fmla z31.h, p3/M, z6.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x11]\n"
- "fmla z12.h, p3/M, z4.h, z19.h\n"
- "fmla z23.h, p3/M, z3.h, z19.h\n"
- "fmla z27.h, p3/M, z8.h, z21.h\n"
- "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x7, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z14.h, p3/M, z6.h, z16.h\n"
+ "fmla z28.h, p3/M, z4.h, z16.h\n"
+ "fmla z30.h, p3/M, z3.h, z16.h\n"
+ "fmla z22.h, p3/M, z1.h, z16.h\n"
+ "fmla z24.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x12, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z17.h\n"
+ "fmla z25.h, p3/M, z1.h, z17.h\n"
+ "fmla z19.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x9]\n"
+ "fmla z9.h, p3/M, z0.h, z21.h\n"
+ "fmla z13.h, p3/M, z2.h, z16.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z16.h\n"
+ "fmla z20.h, p3/M, z5.h, z16.h\n"
+ "fmla z26.h, p3/M, z3.h, z17.h\n"
+ "fmla z28.h, p3/M, z0.h, z17.h\n"
+ "fmla z11.h, p3/M, z4.h, z16.h\n"
+ "fmla z12.h, p3/M, z1.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x27, x16, LSL #1]\n"
"fmla z25.h, p3/M, z2.h, z21.h\n"
- "ld1h { z9.h }, p2/Z, [x11, x27, LSL #1]\n"
- "fmla z17.h, p3/M, z6.h, z29.h\n"
- "fmla z26.h, p3/M, z3.h, z29.h\n"
- "fmla z10.h, p3/M, z0.h, z29.h\n"
- "ld1h { z22.h }, p2/Z, [x24, x7, LSL #1]\n"
- "fmla z24.h, p3/M, z2.h, z9.h\n"
- "fmla z12.h, p3/M, z7.h, z22.h\n"
- "fmla z23.h, p3/M, z6.h, z22.h\n"
- "fmla z26.h, p3/M, z8.h, z19.h\n"
- "fmla z13.h, p3/M, z7.h, z19.h\n"
- "fmla z20.h, p3/M, z6.h, z19.h\n"
- "fmla z10.h, p3/M, z5.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x28, x14, LSL #1]\n"
- "fmla z25.h, p3/M, z5.h, z9.h\n"
- "fmla z12.h, p3/M, z5.h, z21.h\n"
- "fmla z23.h, p3/M, z4.h, z21.h\n"
- "fmla z24.h, p3/M, z3.h, z21.h\n"
- "fmla z11.h, p3/M, z8.h, z9.h\n"
- "ld1h { z19.h }, p2/Z, [x24, x14, LSL #1]\n"
- "fmla z10.h, p3/M, z8.h, z22.h\n"
- "ld1h { z22.h }, p2/Z, [x13, x5, LSL #1]\n"
- "fmla z13.h, p3/M, z8.h, z21.h\n"
- "fmla z20.h, p3/M, z7.h, z21.h\n"
- "fmla z25.h, p3/M, z6.h, z21.h\n"
- "fmla z12.h, p3/M, z8.h, z19.h\n"
- "ld1h { z29.h }, p2/Z, [x13, x10, LSL #1]\n"
- "fmla z23.h, p3/M, z7.h, z19.h\n"
- "fmla z24.h, p3/M, z6.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x28, x5, LSL #1]\n"
- "fmla z31.h, p3/M, z4.h, z22.h\n"
- "fmla z18.h, p3/M, z3.h, z22.h\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
- "fmax z18.h, p3/M, z18.h, z15.h\n"
- "fmla z17.h, p3/M, z1.h, z22.h\n"
- "fmla z14.h, p3/M, z0.h, z22.h\n"
- "ld1h { z9.h }, p2/Z, [x28, x10, LSL #1]\n"
- "fmax z17.h, p3/M, z17.h, z15.h\n"
- "fmla z28.h, p3/M, z5.h, z29.h\n"
- "fmla z27.h, p3/M, z4.h, z29.h\n"
- "fmax z28.h, p3/M, z28.h, z15.h\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
- "fmla z30.h, p3/M, z2.h, z29.h\n"
- "fmla z11.h, p3/M, z1.h, z29.h\n"
- "fmax z14.h, p3/M, z14.h, z15.h\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "fmla z26.h, p3/M, z7.h, z21.h\n"
+ "fmla z19.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x11, LSL #1]\n"
+ "fmla z18.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x28]\n"
+ "fmla z24.h, p3/M, z4.h, z15.h\n"
+ "fmla z13.h, p3/M, z3.h, z15.h\n"
+ "fmla z30.h, p3/M, z7.h, z15.h\n"
+ "fmla z9.h, p3/M, z8.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z16.h\n"
+ "fmla z11.h, p3/M, z2.h, z16.h\n"
+ "fmla z26.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x11, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z17.h\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z20.h, p3/M, z6.h, z15.h\n"
+ "fmla z12.h, p3/M, z2.h, z16.h\n"
+ "fmla z31.h, p3/M, z8.h, z16.h\n"
+ "fmla z24.h, p3/M, z7.h, z21.h\n"
"fmla z13.h, p3/M, z6.h, z21.h\n"
- "fmax z11.h, p3/M, z11.h, z15.h\n"
- "fmax z26.h, p3/M, z26.h, z15.h\n"
- "fmla z10.h, p3/M, z4.h, z21.h\n"
- "fmla z12.h, p3/M, z3.h, z21.h\n"
- "fmax z13.h, p3/M, z13.h, z15.h\n"
- "fmax z10.h, p3/M, z10.h, z15.h\n"
- "fmla z20.h, p3/M, z8.h, z9.h\n"
- "fmla z25.h, p3/M, z7.h, z9.h\n"
- "fmax z20.h, p3/M, z20.h, z15.h\n"
- "fmax z25.h, p3/M, z25.h, z15.h\n"
- "fmla z23.h, p3/M, z5.h, z9.h\n"
- "fmla z24.h, p3/M, z4.h, z9.h\n"
- "fmax z12.h, p3/M, z12.h, z15.h\n"
- "fmax z23.h, p3/M, z23.h, z15.h\n"
- "fmax z24.h, p3/M, z24.h, z15.h\n"
- "fmin z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z31.h }, p0, [x15]\n"
- "fmin z18.h, p3/M, z18.h, z16.h\n"
- "fmin z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z18.h }, p0, [x15, x6, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z16.h\n"
- "fmin z17.h, p3/M, z17.h, z16.h\n"
- "st1h { z28.h }, p0, [x15, x25, LSL #1]\n"
- "fmin z14.h, p3/M, z14.h, z16.h\n"
- "fmin z30.h, p3/M, z30.h, z16.h\n"
- "st1h { z27.h }, p0, [x15, x22, LSL #1]\n"
- "fmin z11.h, p3/M, z11.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z16.h\n"
- "st1h { z17.h }, p0, [x9]\n"
- "fmin z13.h, p3/M, z13.h, z16.h\n"
- "fmin z20.h, p3/M, z20.h, z16.h\n"
- "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
- "fmin z25.h, p3/M, z25.h, z16.h\n"
- "fmin z10.h, p3/M, z10.h, z16.h\n"
- "st1h { z30.h }, p0, [x9, x25, LSL #1]\n"
- "fmin z12.h, p3/M, z12.h, z16.h\n"
- "fmin z23.h, p3/M, z23.h, z16.h\n"
- "st1h { z11.h }, p0, [x9, x22, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z16.h\n"
- "st1h { z26.h }, p0, [x26]\n"
- "st1h { z13.h }, p0, [x26, x6, LSL #1]\n"
- "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
- "st1h { z25.h }, p0, [x26, x22, LSL #1]\n"
- "st1h { z10.h }, p0, [x23]\n"
- "st1h { z12.h }, p0, [x23, x6, LSL #1]\n"
- "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
- "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "fmla z11.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z15.h\n"
+ "fmla z22.h, p3/M, z5.h, z15.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z16.h\n"
+ "fmla z13.h, p3/M, z4.h, z16.h\n"
+ "fmla z12.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmla z11.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z15.h }, p2/Z, [x10, x12, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z16.h }, p2/Z, [x10, x4, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z13.h, p3/M, z7.h, z17.h\n"
+ "fmla z12.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x4, LSL #1]\n"
+ "fmla z19.h, p3/M, z5.h, z15.h\n"
+ "fmla z9.h, p3/M, z4.h, z15.h\n"
+ "fmla z18.h, p3/M, z4.h, z16.h\n"
+ "fmla z25.h, p3/M, z3.h, z16.h\n"
+ "fmla z26.h, p3/M, z1.h, z16.h\n"
+ "fmla z14.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x12, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z15.h\n"
+ "fmla z31.h, p3/M, z1.h, z15.h\n"
+ "fmla z28.h, p3/M, z7.h, z17.h\n"
+ "fmla z30.h, p3/M, z6.h, z17.h\n"
+ "fmax z19.h, p3/M, z19.h, z27.h\n"
+ "fmax z9.h, p3/M, z9.h, z27.h\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "fmax z18.h, p3/M, z18.h, z27.h\n"
+ "fmax z25.h, p3/M, z25.h, z27.h\n"
+ "fmla z20.h, p3/M, z8.h, z16.h\n"
+ "fmla z11.h, p3/M, z7.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z27.h\n"
+ "fmax z14.h, p3/M, z14.h, z27.h\n"
+ "fmla z13.h, p3/M, z5.h, z16.h\n"
+ "fmla z12.h, p3/M, z4.h, z16.h\n"
+ "fmax z23.h, p3/M, z23.h, z27.h\n"
+ "fmax z31.h, p3/M, z31.h, z27.h\n"
+ "fmax z28.h, p3/M, z28.h, z27.h\n"
+ "fmax z30.h, p3/M, z30.h, z27.h\n"
+ "fmax z22.h, p3/M, z22.h, z27.h\n"
+ "fmax z24.h, p3/M, z24.h, z27.h\n"
+ "fmax z20.h, p3/M, z20.h, z27.h\n"
+ "fmax z11.h, p3/M, z11.h, z27.h\n"
+ "fmax z13.h, p3/M, z13.h, z27.h\n"
+ "fmax z12.h, p3/M, z12.h, z27.h\n"
+ "fmin z18.h, p3/M, z18.h, z29.h\n"
+ "fmin z25.h, p3/M, z25.h, z29.h\n"
+ "fmin z19.h, p3/M, z19.h, z29.h\n"
+ "fmin z9.h, p3/M, z9.h, z29.h\n"
+ "fmin z26.h, p3/M, z26.h, z29.h\n"
+ "fmin z14.h, p3/M, z14.h, z29.h\n"
+ "fmin z23.h, p3/M, z23.h, z29.h\n"
+ "fmin z31.h, p3/M, z31.h, z29.h\n"
+ "st1h { z18.h }, p0, [x17]\n"
+ "fmin z28.h, p3/M, z28.h, z29.h\n"
+ "fmin z30.h, p3/M, z30.h, z29.h\n"
+ "st1h { z25.h }, p0, [x17, x5, LSL #1]\n"
+ "fmin z20.h, p3/M, z20.h, z29.h\n"
+ "fmin z11.h, p3/M, z11.h, z29.h\n"
+ "st1h { z19.h }, p0, [x17, x15, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z29.h\n"
+ "fmin z24.h, p3/M, z24.h, z29.h\n"
+ "st1h { z9.h }, p0, [x17, x13, LSL #1]\n"
+ "fmin z13.h, p3/M, z13.h, z29.h\n"
+ "fmin z12.h, p3/M, z12.h, z29.h\n"
+ "st1h { z26.h }, p0, [x25]\n"
+ "st1h { z14.h }, p0, [x25, x5, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25, x15, LSL #1]\n"
+ "st1h { z31.h }, p0, [x25, x13, LSL #1]\n"
+ "st1h { z28.h }, p0, [x24]\n"
+ "st1h { z30.h }, p0, [x24, x5, LSL #1]\n"
+ "st1h { z20.h }, p0, [x24, x15, LSL #1]\n"
+ "st1h { z11.h }, p0, [x24, x13, LSL #1]\n"
+ "st1h { z22.h }, p0, [x23]\n"
+ "st1h { z24.h }, p0, [x23, x5, LSL #1]\n"
+ "st1h { z13.h }, p0, [x23, x15, LSL #1]\n"
+ "st1h { z12.h }, p0, [x23, x13, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
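For reference, the tile-offset pointer arithmetic spelled out by the assembly comments above ("offset = tile_i * ld_input_row", "offset += tile_j * ld_input_col", "offset *= kernel_stride * output_size", "inptr[0] += offset * sizeof(__fp16)") can be sketched in plain C++ as below. This is only an illustrative sketch: the helper name and parameter names are assumptions for exposition and are not part of the generated kernel.

#include <cstddef>
#include <cstdint>

using fp16_storage = std::uint16_t;  // 2-byte stand-in for __fp16 storage (assumption)

// Sketch of the input-tile base-pointer computation described in the asm comments.
static fp16_storage *tile_inptr(fp16_storage *inptr,
                                std::size_t tile_i, std::size_t tile_j,
                                std::size_t ld_input_row, std::size_t ld_input_col,
                                std::size_t stride_scale /* kernel_stride * output_size */)
{
  std::size_t offset = tile_i * ld_input_row;  // offset  = tile_i * ld_input_row
  offset += tile_j * ld_input_col;             // offset += tile_j * ld_input_col
  offset *= stride_scale;                      // offset *= kernel_stride * output_size
  // The asm adds "offset, LSL #1" to a byte address (offset * sizeof(__fp16));
  // C++ pointer arithmetic on a 2-byte element type performs the same scaling.
  return inptr + offset;
}
// The output pointer is advanced the same way, except the scale factor is
// output_tile_size ("offset *= output_tile_size" in the comments).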
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index c0be293cd7..d024ad0479 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -101,607 +101,607 @@ void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
"add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1h { z17.h }, p3/Z, [x7]\n"
- "cnth x17\n"
- "mov x16, #0x0\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "cnth x16\n"
+ "mov x15, #0x0\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z22.h }, p3/Z, [x7]\n"
"ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
"ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
- "whilelt p2.h, XZR, %x[n_channels]\n"
"ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
"ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
- "cmp x17, %x[n_channels]\n"
"ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "sub x14, XZR, x16\n"
"ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
- "sub x15, XZR, x17\n"
"ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
"addvl x7, x7, #16\n"
- "ldp x23, x22, [x8, #0x0]\n"
- "ldp x21, x20, [x8, #0x10]\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
"ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
"addvl x7, x7, #-6\n"
- "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z20, z17\n fmla z20.h, p3/M, z4.h, z9.h\n"
- "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z9.h\n"
+ "movprfx z29, z22\n fmla z29.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z22\n fmla z18.h, p3/M, z8.h, z9.h\n"
"ldr x27, [x8, #0x20]\n"
"ldr x24, [x8, #0x30]\n"
- "movprfx z24, z17\n fmla z24.h, p3/M, z3.h, z9.h\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z17, z22\n fmla z17.h, p3/M, z3.h, z9.h\n"
+ "movprfx z26, z22\n fmla z26.h, p3/M, z1.h, z9.h\n"
"ldr x23, [x8, #0x28]\n"
"ldr x22, [x8, #0x38]\n"
- "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
- "movprfx z22, z17\n fmla z22.h, p3/M, z7.h, z9.h\n"
+ "movprfx z21, z22\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "movprfx z28, z22\n fmla z28.h, p3/M, z7.h, z9.h\n"
"ldr x26, [x8, #0x40]\n"
- "ldr x21, [x8, #0x48]\n"
- "movprfx z27, z17\n fmla z27.h, p3/M, z6.h, z9.h\n"
- "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "ldr x20, [x8, #0x48]\n"
+ "movprfx z25, z22\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z30, z22\n fmla z30.h, p3/M, z5.h, z9.h\n"
"ldr x25, [x8, #0x50]\n"
- "ldr x20, [x8, #0x58]\n"
- "movprfx z14, z17\n fmla z14.h, p3/M, z5.h, z9.h\n"
- "movprfx z23, z17\n fmla z23.h, p3/M, z2.h, z9.h\n"
- "ld1h { z25.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x58]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "movprfx z23, z22\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
"ldr x13, [x8, #0x70]\n"
- "fmla z26.h, p3/M, z0.h, z10.h\n"
- "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
- "ld1h { z28.h }, p2/Z, [x27, x16, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
- "fmla z24.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "movprfx z10, z22\n fmla z10.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
"ldr x24, [x8, #0x60]\n"
"ldr x23, [x8, #0x68]\n"
- "fmla z31.h, p3/M, z1.h, z12.h\n"
- "fmla z22.h, p3/M, z8.h, z12.h\n"
- "inch x15\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z12.h\n"
+ "inch x14\n"
"mov p1.b, p2.b\n"
- "fmla z27.h, p3/M, z7.h, z12.h\n"
- "movprfx z15, z17\n fmla z15.h, p3/M, z6.h, z28.h\n"
- "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z12.h\n"
+ "movprfx z31, z22\n fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, x15, LSL #1]\n"
"ldr x28, [x8, #0x88]\n"
- "fmla z20.h, p3/M, z7.h, z25.h\n"
- "fmla z9.h, p3/M, z6.h, z12.h\n"
- "ldr x12, [x14, #0x0]\n"
- "ldr x11, [x14, #0x8]\n"
- "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
- "movprfx z13, z17\n fmla z13.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z16.h\n"
+ "fmla z10.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x17, #0x0]\n"
+ "ldr x11, [x17, #0x8]\n"
+ "movprfx z15, z22\n fmla z15.h, p3/M, z3.h, z12.h\n"
+ "movprfx z20, z22\n fmla z20.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
"ldr x22, [x8, #0x78]\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z21.h\n"
- "fmla z24.h, p3/M, z6.h, z25.h\n"
- "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
- "ldr x21, [x8, #0x80]\n"
- "fmla z30.h, p3/M, z4.h, z25.h\n"
- "fmla z31.h, p3/M, z3.h, z25.h\n"
- "ldr x10, [x14, #0x10]\n"
- "ldr x9, [x14, #0x18]\n"
- "movprfx z18, z17\n fmla z18.h, p3/M, z1.h, z25.h\n"
- "movprfx z21, z17\n fmla z21.h, p3/M, z0.h, z25.h\n"
- "whilelt p0.h, x17, %x[n_channels]\n"
- "ld1h { z17.h }, p3/Z, [x7]\n"
- "fmla z14.h, p3/M, z8.h, z25.h\n"
- "fmla z23.h, p3/M, z5.h, z25.h\n"
- "fmla z15.h, p3/M, z2.h, z25.h\n"
- "fmla z26.h, p3/M, z1.h, z12.h\n"
- "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "movprfx z24, z22\n fmla z24.h, p3/M, z8.h, z27.h\n"
+ "fmla z17.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x20, [x8, #0x80]\n"
+ "fmla z26.h, p3/M, z4.h, z16.h\n"
+ "fmla z21.h, p3/M, z3.h, z16.h\n"
+ "ldr x10, [x17, #0x10]\n"
+ "ldr x9, [x17, #0x18]\n"
+ "movprfx z13, z22\n fmla z13.h, p3/M, z1.h, z16.h\n"
+ "movprfx z27, z22\n fmla z27.h, p3/M, z0.h, z16.h\n"
+ "whilelt p0.h, x16, %x[n_channels]\n"
+ "ld1h { z22.h }, p3/Z, [x7]\n"
+ "fmla z30.h, p3/M, z8.h, z16.h\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z2.h, z16.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x15, LSL #1]\n"
"ldr x27, [x8, #0x90]\n"
- "fmla z22.h, p3/M, z0.h, z12.h\n"
- "fmla z27.h, p3/M, z2.h, z29.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
- "ldr x20, [x8, #0x98]\n"
- "fmla z20.h, p3/M, z8.h, z10.h\n"
- "fmla z9.h, p3/M, z1.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z0.h, z11.h\n"
+ "fmla z25.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x8, #0x98]\n"
+ "fmla z29.h, p3/M, z8.h, z9.h\n"
+ "fmla z10.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
"ldr x26, [x8, #0xa0]\n"
- "fmla z24.h, p3/M, z7.h, z10.h\n"
- "fmla z11.h, p3/M, z6.h, z10.h\n"
- "fmla z30.h, p3/M, z5.h, z10.h\n"
- "fmla z31.h, p3/M, z4.h, z10.h\n"
- "fmla z13.h, p3/M, z3.h, z10.h\n"
- "fmla z18.h, p3/M, z2.h, z10.h\n"
- "fmla z21.h, p3/M, z1.h, z10.h\n"
- "fmla z28.h, p3/M, z0.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z9.h\n"
+ "fmla z15.h, p3/M, z6.h, z9.h\n"
+ "fmla z26.h, p3/M, z5.h, z9.h\n"
+ "fmla z21.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "fmla z13.h, p3/M, z2.h, z9.h\n"
+ "fmla z27.h, p3/M, z1.h, z9.h\n"
+ "fmla z24.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
"ldr x25, [x8, #0xa8]\n"
- "fmla z26.h, p3/M, z3.h, z25.h\n"
- "fmla z14.h, p3/M, z0.h, z25.h\n"
- "fmla z23.h, p3/M, z6.h, z29.h\n"
- "fmla z15.h, p3/M, z3.h, z29.h\n"
- "ld1h { z25.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z3.h, z16.h\n"
+ "fmla z30.h, p3/M, z0.h, z16.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
"ldr x24, [x8, #0xb0]\n"
- "fmla z22.h, p3/M, z4.h, z10.h\n"
- "fmla z27.h, p3/M, z3.h, z10.h\n"
- "fmla z20.h, p3/M, z1.h, z10.h\n"
- "fmla z9.h, p3/M, z5.h, z12.h\n"
- "fmla z11.h, p3/M, z2.h, z12.h\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "fmla z25.h, p3/M, z3.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "fmla z10.h, p3/M, z5.h, z11.h\n"
+ "fmla z15.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
"ldr x23, [x8, #0xb8]\n"
- "fmla z13.h, p3/M, z8.h, z25.h\n"
- "fmla z28.h, p3/M, z5.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmla z20.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
"ldr x22, [x8, #0xc0]\n"
- "fmla z26.h, p3/M, z5.h, z10.h\n"
- "fmla z14.h, p3/M, z2.h, z10.h\n"
- "ld1h { z29.h }, p2/Z, [x28, x16, LSL #1]\n"
- "ldr x21, [x8, #0xc8]\n"
- "fmla z22.h, p3/M, z5.h, z12.h\n"
- "fmla z27.h, p3/M, z4.h, z12.h\n"
- "fmla z20.h, p3/M, z2.h, z12.h\n"
- "fmla z9.h, p3/M, z3.h, z12.h\n"
- "fmla z24.h, p3/M, z1.h, z12.h\n"
- "fmla z11.h, p3/M, z0.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x15, LSL #1]\n"
+ "ldr x20, [x8, #0xc8]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "fmla z10.h, p3/M, z3.h, z11.h\n"
+ "fmla z17.h, p3/M, z1.h, z11.h\n"
+ "fmla z15.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x28, [x8, #0xd8]\n"
- "fmla z15.h, p3/M, z7.h, z25.h\n"
- "fmla z18.h, p3/M, z6.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x27, x16, LSL #1]\n"
- "ldr x20, [x8, #0xd0]\n"
- "fmla z26.h, p3/M, z7.h, z29.h\n"
- "fmla z22.h, p3/M, z6.h, z29.h\n"
- "fmla z14.h, p3/M, z4.h, z29.h\n"
- "fmla z20.h, p3/M, z3.h, z29.h\n"
- "fmla z23.h, p3/M, z1.h, z29.h\n"
- "fmla z30.h, p3/M, z0.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "fmla z13.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ldr x21, [x8, #0xd0]\n"
+ "fmla z18.h, p3/M, z7.h, z16.h\n"
+ "fmla z28.h, p3/M, z6.h, z16.h\n"
+ "fmla z30.h, p3/M, z4.h, z16.h\n"
+ "fmla z29.h, p3/M, z3.h, z16.h\n"
+ "fmla z23.h, p3/M, z1.h, z16.h\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x15, LSL #1]\n"
"ldr x27, [x8, #0xe0]\n"
- "fmla z27.h, p3/M, z8.h, z10.h\n"
- "fmla z21.h, p3/M, z8.h, z25.h\n"
- "fmla z28.h, p3/M, z7.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
- "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z9.h\n"
+ "fmla z27.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z1.h, z9.h\n"
"ldr x26, [x8, #0xe8]\n"
- "fmla z9.h, p3/M, z7.h, z10.h\n"
- "fmla z24.h, p3/M, z5.h, z10.h\n"
- "fmla z11.h, p3/M, z4.h, z10.h\n"
- "fmla z31.h, p3/M, z2.h, z10.h\n"
- "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z7.h, z9.h\n"
+ "fmla z17.h, p3/M, z5.h, z9.h\n"
+ "fmla z15.h, p3/M, z4.h, z9.h\n"
+ "fmla z21.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
"ldr x25, [x8, #0xf0]\n"
- "fmla z26.h, p3/M, z2.h, z29.h\n"
- "fmla z22.h, p3/M, z1.h, z29.h\n"
- "fmla z27.h, p3/M, z0.h, z29.h\n"
- "fmla z14.h, p3/M, z7.h, z25.h\n"
- "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z16.h\n"
+ "fmla z28.h, p3/M, z1.h, z16.h\n"
+ "fmla z25.h, p3/M, z0.h, z16.h\n"
+ "fmla z30.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
"ldr x24, [x8, #0xf8]\n"
- "fmla z20.h, p3/M, z6.h, z25.h\n"
- "fmla z23.h, p3/M, z4.h, z25.h\n"
- "fmla z30.h, p3/M, z3.h, z25.h\n"
- "fmla z15.h, p3/M, z1.h, z25.h\n"
- "fmla z18.h, p3/M, z0.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
- "fmla z13.h, p3/M, z4.h, z25.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
"ldr x23, [x8, #0x100]\n"
- "fmla z21.h, p3/M, z2.h, z25.h\n"
- "fmla z22.h, p3/M, z2.h, z10.h\n"
- "fmla z27.h, p3/M, z1.h, z10.h\n"
- "fmla z9.h, p3/M, z0.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
- "ldr x22, [x8, #0x108]\n"
- "fmla z26.h, p3/M, z6.h, z29.h\n"
- "fmla z14.h, p3/M, z3.h, z29.h\n"
- "fmla z23.h, p3/M, z0.h, z29.h\n"
- "fmla z24.h, p3/M, z8.h, z25.h\n"
- "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
- "ldr x21, [x8, #0x110]\n"
- "fmla z11.h, p3/M, z7.h, z25.h\n"
- "fmla z31.h, p3/M, z5.h, z25.h\n"
- "fmla z28.h, p3/M, z1.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x28, x16, LSL #1]\n"
- "fmla z13.h, p3/M, z2.h, z12.h\n"
- "ldr x20, [x8, #0x118]\n"
- "fmla z15.h, p3/M, z0.h, z10.h\n"
- "fmla z18.h, p3/M, z4.h, z25.h\n"
- "fmla z21.h, p3/M, z3.h, z25.h\n"
- "fmla z9.h, p3/M, z8.h, z12.h\n"
- "fmla z11.h, p3/M, z5.h, z12.h\n"
- "fmla z14.h, p3/M, z6.h, z10.h\n"
- "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
- "fmla z23.h, p3/M, z3.h, z10.h\n"
- "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z25.h\n"
- "fmla z31.h, p3/M, z6.h, z25.h\n"
- "fmla z15.h, p3/M, z5.h, z25.h\n"
- "fmla z13.h, p3/M, z5.h, z12.h\n"
- "fmla z28.h, p3/M, z2.h, z12.h\n"
- "fmla z18.h, p3/M, z7.h, z29.h\n"
- "fmla z21.h, p3/M, z6.h, z29.h\n"
- "fmla z23.h, p3/M, z8.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
- "fmla z15.h, p3/M, z8.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z25.h\n"
- "fmla z31.h, p3/M, z7.h, z25.h\n"
- "fmla z13.h, p3/M, z6.h, z25.h\n"
- "fmla z18.h, p3/M, z5.h, z25.h\n"
- "fmla z21.h, p3/M, z4.h, z25.h\n"
- "fmla z28.h, p3/M, z3.h, z25.h\n"
- "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
- "ldp x27, x26, [x8, #0x0]\n"
- "fmla z11.h, p3/M, z8.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
- "fmla z26.h, p3/M, z4.h, z29.h\n"
- "fmax z26.h, p3/M, z26.h, z16.h\n"
- "fmla z22.h, p3/M, z3.h, z29.h\n"
- "fmla z27.h, p3/M, z5.h, z25.h\n"
- "fmax z22.h, p3/M, z22.h, z16.h\n"
- "fmax z27.h, p3/M, z27.h, z16.h\n"
- "fmla z9.h, p3/M, z4.h, z25.h\n"
- "fmla z18.h, p3/M, z8.h, z12.h\n"
- "fmax z9.h, p3/M, z9.h, z16.h\n"
- "fmin z26.h, p3/M, z26.h, z19.h\n"
- "fmla z21.h, p3/M, z7.h, z12.h\n"
- "fmla z28.h, p3/M, z6.h, z12.h\n"
- "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
- "fmin z22.h, p3/M, z22.h, z19.h\n"
- "fmla z14.h, p3/M, z1.h, z29.h\n"
- "fmla z20.h, p3/M, z0.h, z29.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
- "fmin z27.h, p3/M, z27.h, z19.h\n"
- "fmla z24.h, p3/M, z2.h, z25.h\n"
- "fmla z11.h, p3/M, z1.h, z25.h\n"
- "fmin z9.h, p3/M, z9.h, z19.h\n"
- "fmax z14.h, p3/M, z14.h, z16.h\n"
- "fmla z23.h, p3/M, z7.h, z10.h\n"
- "fmla z30.h, p3/M, z6.h, z10.h\n"
- "fmax z20.h, p3/M, z20.h, z16.h\n"
- "fmax z24.h, p3/M, z24.h, z16.h\n"
- "fmla z31.h, p3/M, z8.h, z12.h\n"
- "fmla z13.h, p3/M, z7.h, z12.h\n"
- "fmax z11.h, p3/M, z11.h, z16.h\n"
- "st1h { z26.h }, p1, [x12, x15, LSL #1]\n"
- "st1h { z22.h }, p1, [x11, x15, LSL #1]\n"
- "ldr x23, [x14, #0x20]\n"
- "ldr x22, [x14, #0x28]\n"
- "fmla z15.h, p3/M, z4.h, z10.h\n"
- "st1h { z27.h }, p1, [x10, x15, LSL #1]\n"
- "ldr x21, [x14, #0x30]\n"
- "fmla z18.h, p3/M, z3.h, z10.h\n"
- "fmla z21.h, p3/M, z5.h, z12.h\n"
- "st1h { z9.h }, p1, [x9, x15, LSL #1]\n"
- "ldr x20, [x14, #0x38]\n"
- "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z25.h, p3/M, z1.h, z11.h\n"
+ "fmla z10.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x8, #0x108]\n"
+ "fmla z18.h, p3/M, z6.h, z16.h\n"
+ "fmla z30.h, p3/M, z3.h, z16.h\n"
+ "fmla z20.h, p3/M, z4.h, z9.h\n"
+ "fmla z27.h, p3/M, z2.h, z9.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "fmla z17.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x22, [x8, #0x110]\n"
+ "fmla z15.h, p3/M, z7.h, z9.h\n"
+ "fmla z21.h, p3/M, z5.h, z9.h\n"
+ "fmla z24.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x28, x15, LSL #1]\n"
+ "fmla z10.h, p3/M, z8.h, z11.h\n"
+ "ldr x21, [x8, #0x118]\n"
+ "fmla z20.h, p3/M, z2.h, z11.h\n"
+ "fmla z31.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z16.h\n"
+ "fmla z27.h, p3/M, z3.h, z16.h\n"
+ "fmla z15.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z16.h\n"
+ "fmla z21.h, p3/M, z6.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z16.h\n"
+ "fmla z23.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z13.h, p3/M, z7.h, z9.h\n"
+ "fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z15.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z8.h, z16.h\n"
+ "ld1h { z9.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmla z20.h, p3/M, z6.h, z16.h\n"
+ "fmla z13.h, p3/M, z5.h, z16.h\n"
+ "fmla z24.h, p3/M, z3.h, z16.h\n"
+ "fmla z27.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z9.h\n"
+ "ldp x20, x26, [x8, #0x0]\n"
+ "fmla z28.h, p3/M, z3.h, z9.h\n"
+ "fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z29.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "fmla z25.h, p3/M, z5.h, z11.h\n"
+ "fmla z10.h, p3/M, z4.h, z11.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z9.h }, p0/Z, [x20, x16, LSL #1]\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z24.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmax z18.h, p3/M, z18.h, z19.h\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z15.h, p3/M, z1.h, z11.h\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "fmax z30.h, p3/M, z30.h, z19.h\n"
+ "fmax z25.h, p3/M, z25.h, z19.h\n"
+ "fmla z21.h, p3/M, z8.h, z16.h\n"
+ "fmla z20.h, p3/M, z7.h, z16.h\n"
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "fmax z10.h, p3/M, z10.h, z19.h\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "fmin z18.h, p3/M, z18.h, z14.h\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
"ldp x25, x24, [x8, #0x10]\n"
- "fmin z14.h, p3/M, z14.h, z19.h\n"
- "fmin z20.h, p3/M, z20.h, z19.h\n"
- "st1h { z14.h }, p1, [x23, x15, LSL #1]\n"
- "ldr x23, [x14, #0x40]\n"
- "fmin z24.h, p3/M, z24.h, z19.h\n"
- "fmin z11.h, p3/M, z11.h, z19.h\n"
- "st1h { z20.h }, p1, [x22, x15, LSL #1]\n"
- "ldr x22, [x14, #0x48]\n"
- "fmax z23.h, p3/M, z23.h, z16.h\n"
- "fmax z30.h, p3/M, z30.h, z16.h\n"
- "st1h { z24.h }, p1, [x21, x15, LSL #1]\n"
- "ldr x21, [x14, #0x50]\n"
- "fmax z31.h, p3/M, z31.h, z16.h\n"
- "fmax z13.h, p3/M, z13.h, z16.h\n"
- "st1h { z11.h }, p1, [x20, x15, LSL #1]\n"
- "ldr x20, [x14, #0x58]\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "fmax z17.h, p3/M, z17.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "fmla z24.h, p3/M, z4.h, z16.h\n"
+ "fmin z10.h, p3/M, z10.h, z14.h\n"
+ "fmax z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z18.h }, p1, [x12, x14, LSL #1]\n"
+ "ldr x23, [x17, #0x20]\n"
+ "st1h { z28.h }, p1, [x11, x14, LSL #1]\n"
+ "ldr x22, [x17, #0x28]\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "st1h { z25.h }, p1, [x10, x14, LSL #1]\n"
+ "ldr x21, [x17, #0x30]\n"
+ "fmin z17.h, p3/M, z17.h, z14.h\n"
+ "fmax z23.h, p3/M, z23.h, z19.h\n"
+ "st1h { z10.h }, p1, [x9, x14, LSL #1]\n"
+ "ldr x20, [x17, #0x38]\n"
+ "fmin z15.h, p3/M, z15.h, z14.h\n"
+ "fmax z26.h, p3/M, z26.h, z19.h\n"
+ "fmax z21.h, p3/M, z21.h, z19.h\n"
+ "fmax z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z30.h }, p1, [x23, x14, LSL #1]\n"
+ "ldr x23, [x17, #0x40]\n"
+ "st1h { z29.h }, p1, [x22, x14, LSL #1]\n"
+ "ldr x22, [x17, #0x48]\n"
+ "inch x15\n"
+ "ld1h { z10.h }, p0/Z, [x26, x16, LSL #1]\n"
+ "st1h { z17.h }, p1, [x21, x14, LSL #1]\n"
+ "ldr x21, [x17, #0x50]\n"
+ "ld1h { z11.h }, p0/Z, [x25, x16, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "st1h { z15.h }, p1, [x20, x14, LSL #1]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "ld1h { z12.h }, p0/Z, [x24, x16, LSL #1]\n"
"inch x16\n"
- "ld1h { z9.h }, p0/Z, [x27, x17, LSL #1]\n"
- "ld1h { z10.h }, p0/Z, [x26, x17, LSL #1]\n"
- "fmin z23.h, p3/M, z23.h, z19.h\n"
- "ld1h { z11.h }, p0/Z, [x25, x17, LSL #1]\n"
- "ld1h { z12.h }, p0/Z, [x24, x17, LSL #1]\n"
- "inch x17\n"
- "fmin z30.h, p3/M, z30.h, z19.h\n"
- "fmin z31.h, p3/M, z31.h, z19.h\n"
- "fmin z13.h, p3/M, z13.h, z19.h\n"
- "st1h { z23.h }, p1, [x23, x15, LSL #1]\n"
- "ldr x23, [x14, #0x60]\n"
- "fmax z15.h, p3/M, z15.h, z16.h\n"
- "fmax z18.h, p3/M, z18.h, z16.h\n"
- "st1h { z30.h }, p1, [x22, x15, LSL #1]\n"
- "ldr x22, [x14, #0x68]\n"
- "fmax z21.h, p3/M, z21.h, z16.h\n"
- "fmax z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z31.h }, p1, [x21, x15, LSL #1]\n"
- "ldr x21, [x14, #0x70]\n"
- "st1h { z13.h }, p1, [x20, x15, LSL #1]\n"
- "ldr x20, [x14, #0x78]\n"
- "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
- "whilelt p2.h, x16, %x[n_channels]\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "fmin z21.h, p3/M, z21.h, z14.h\n"
"ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
"ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
- "cmp x17, %x[n_channels]\n"
- "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ "fmax z31.h, p3/M, z31.h, z19.h\n"
+ "st1h { z23.h }, p1, [x23, x14, LSL #1]\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmax z13.h, p3/M, z13.h, z19.h\n"
+ "fmax z27.h, p3/M, z27.h, z19.h\n"
"ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
"ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
- "fmin z18.h, p3/M, z18.h, z19.h\n"
- "fmin z21.h, p3/M, z21.h, z19.h\n"
+ "fmax z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z26.h }, p1, [x22, x14, LSL #1]\n"
+ "ldr x22, [x17, #0x68]\n"
"ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
+ "ldr x21, [x17, #0x70]\n"
"ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
"addvl x7, x7, #16\n"
- "fmin z28.h, p3/M, z28.h, z19.h\n"
- "st1h { z15.h }, p1, [x23, x15, LSL #1]\n"
+ "st1h { z20.h }, p1, [x20, x14, LSL #1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "fmin z13.h, p3/M, z13.h, z14.h\n"
+ "fmin z27.h, p3/M, z27.h, z14.h\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
"ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
"ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
"addvl x7, x7, #-6\n"
- "st1h { z18.h }, p1, [x22, x15, LSL #1]\n"
- "st1h { z21.h }, p1, [x21, x15, LSL #1]\n"
- "st1h { z28.h }, p1, [x20, x15, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x14, LSL #1]\n"
+ "st1h { z13.h }, p1, [x22, x14, LSL #1]\n"
+ "st1h { z27.h }, p1, [x21, x14, LSL #1]\n"
+ "st1h { z24.h }, p1, [x20, x14, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z14, z17\n fmla z14.h, p3/M, z4.h, z9.h\n"
- "movprfx z18, z17\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "movprfx z16, z22\n fmla z16.h, p3/M, z4.h, z9.h\n"
+ "movprfx z30, z22\n fmla z30.h, p3/M, z8.h, z9.h\n"
"ldr x27, [x8, #0x20]\n"
"ldr x24, [x8, #0x30]\n"
- "movprfx z15, z17\n fmla z15.h, p3/M, z3.h, z9.h\n"
- "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z13, z22\n fmla z13.h, p3/M, z3.h, z9.h\n"
+ "movprfx z15, z22\n fmla z15.h, p3/M, z1.h, z9.h\n"
"ldr x23, [x8, #0x28]\n"
"ldr x22, [x8, #0x38]\n"
- "movprfx z20, z17\n fmla z20.h, p3/M, z0.h, z9.h\n"
- "movprfx z13, z17\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "movprfx z20, z22\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z18, z22\n fmla z18.h, p3/M, z7.h, z9.h\n"
"ldr x26, [x8, #0x40]\n"
"ldr x21, [x8, #0x48]\n"
- "movprfx z22, z17\n fmla z22.h, p3/M, z6.h, z9.h\n"
- "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "movprfx z26, z22\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "movprfx z31, z22\n fmla z31.h, p3/M, z5.h, z9.h\n"
"ldr x25, [x8, #0x50]\n"
"ldr x20, [x8, #0x58]\n"
- "movprfx z27, z17\n fmla z27.h, p3/M, z5.h, z9.h\n"
- "movprfx z31, z17\n fmla z31.h, p3/M, z2.h, z9.h\n"
- "ld1h { z23.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z16.h, p3/M, z5.h, z12.h\n"
+ "movprfx z28, z22\n fmla z28.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z27.h }, p2/Z, [x24, x15, LSL #1]\n"
"ldr x13, [x8, #0x70]\n"
- "fmla z18.h, p3/M, z0.h, z10.h\n"
- "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
- "ld1h { z21.h }, p2/Z, [x27, x16, LSL #1]\n"
- "ld1h { z25.h }, p2/Z, [x23, x16, LSL #1]\n"
- "fmla z15.h, p3/M, z4.h, z12.h\n"
- "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z10.h\n"
+ "movprfx z29, z22\n fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z24.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z12.h\n"
+ "fmla z15.h, p3/M, z2.h, z12.h\n"
"ldr x24, [x8, #0x60]\n"
"ldr x23, [x8, #0x68]\n"
"fmla z20.h, p3/M, z1.h, z12.h\n"
- "fmla z13.h, p3/M, z8.h, z12.h\n"
- "inch x15\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "inch x14\n"
"mov p0.b, p2.b\n"
- "fmla z22.h, p3/M, z7.h, z12.h\n"
- "movprfx z28, z17\n fmla z28.h, p3/M, z6.h, z21.h\n"
- "ld1h { z29.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "movprfx z9, z22\n fmla z9.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x28, [x8, #0x88]\n"
- "fmla z14.h, p3/M, z7.h, z23.h\n"
- "fmla z9.h, p3/M, z6.h, z12.h\n"
- "ldr x12, [x14, #0x0]\n"
- "ldr x11, [x14, #0x8]\n"
- "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
- "movprfx z10, z17\n fmla z10.h, p3/M, z0.h, z12.h\n"
- "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z16.h, p3/M, z7.h, z27.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x17, #0x0]\n"
+ "ldr x11, [x17, #0x8]\n"
+ "movprfx z11, z22\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z23, z22\n fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x15, LSL #1]\n"
"ldr x22, [x8, #0x78]\n"
- "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z25.h\n"
- "fmla z15.h, p3/M, z6.h, z23.h\n"
- "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "movprfx z25, z22\n fmla z25.h, p3/M, z8.h, z24.h\n"
+ "fmla z13.h, p3/M, z6.h, z27.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
"ldr x21, [x8, #0x80]\n"
- "fmla z30.h, p3/M, z4.h, z23.h\n"
- "fmla z20.h, p3/M, z3.h, z23.h\n"
- "ldr x10, [x14, #0x10]\n"
- "ldr x9, [x14, #0x18]\n"
- "movprfx z25, z17\n fmla z25.h, p3/M, z1.h, z23.h\n"
- "movprfx z24, z17\n fmla z24.h, p3/M, z0.h, z23.h\n"
- "fmla z27.h, p3/M, z8.h, z23.h\n"
- "fmla z31.h, p3/M, z5.h, z23.h\n"
- "fmla z28.h, p3/M, z2.h, z23.h\n"
- "fmla z18.h, p3/M, z1.h, z12.h\n"
- "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z4.h, z27.h\n"
+ "fmla z20.h, p3/M, z3.h, z27.h\n"
+ "ldr x10, [x17, #0x10]\n"
+ "ldr x9, [x17, #0x18]\n"
+ "movprfx z24, z22\n fmla z24.h, p3/M, z1.h, z27.h\n"
+ "movprfx z12, z22\n fmla z12.h, p3/M, z0.h, z27.h\n"
+ "fmla z31.h, p3/M, z8.h, z27.h\n"
+ "fmla z28.h, p3/M, z5.h, z27.h\n"
+ "fmla z9.h, p3/M, z2.h, z27.h\n"
+ "fmla z30.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z27.h }, p2/Z, [x25, x15, LSL #1]\n"
"ldr x27, [x8, #0x90]\n"
- "fmla z13.h, p3/M, z0.h, z12.h\n"
- "fmla z22.h, p3/M, z2.h, z21.h\n"
- "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z0.h, z21.h\n"
+ "fmla z26.h, p3/M, z2.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x20, x15, LSL #1]\n"
"ldr x20, [x8, #0x98]\n"
- "fmla z14.h, p3/M, z8.h, z29.h\n"
- "fmla z9.h, p3/M, z1.h, z21.h\n"
- "ld1h { z21.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z16.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
"ldr x26, [x8, #0xa0]\n"
- "fmla z15.h, p3/M, z7.h, z29.h\n"
- "fmla z11.h, p3/M, z6.h, z29.h\n"
- "fmla z30.h, p3/M, z5.h, z29.h\n"
- "fmla z20.h, p3/M, z4.h, z29.h\n"
- "fmla z10.h, p3/M, z3.h, z29.h\n"
- "fmla z25.h, p3/M, z2.h, z29.h\n"
- "fmla z24.h, p3/M, z1.h, z29.h\n"
- "fmla z26.h, p3/M, z0.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z7.h, z10.h\n"
+ "fmla z11.h, p3/M, z6.h, z10.h\n"
+ "fmla z15.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "fmla z24.h, p3/M, z2.h, z10.h\n"
+ "fmla z12.h, p3/M, z1.h, z10.h\n"
+ "fmla z25.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z22.h }, p2/Z, [x23, x15, LSL #1]\n"
"ldr x25, [x8, #0xa8]\n"
- "fmla z18.h, p3/M, z3.h, z23.h\n"
- "fmla z27.h, p3/M, z0.h, z23.h\n"
- "fmla z31.h, p3/M, z6.h, z21.h\n"
- "fmla z28.h, p3/M, z3.h, z21.h\n"
- "ld1h { z21.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z27.h\n"
+ "fmla z31.h, p3/M, z0.h, z27.h\n"
+ "fmla z28.h, p3/M, z6.h, z17.h\n"
+ "fmla z9.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x13, x15, LSL #1]\n"
"ldr x24, [x8, #0xb0]\n"
- "fmla z13.h, p3/M, z4.h, z29.h\n"
- "fmla z22.h, p3/M, z3.h, z29.h\n"
- "fmla z14.h, p3/M, z1.h, z29.h\n"
- "fmla z9.h, p3/M, z5.h, z12.h\n"
- "fmla z11.h, p3/M, z2.h, z12.h\n"
- "fmla z15.h, p3/M, z0.h, z29.h\n"
- "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z22.h\n"
+ "fmla z26.h, p3/M, z3.h, z22.h\n"
+ "fmla z16.h, p3/M, z1.h, z22.h\n"
+ "fmla z29.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z2.h, z21.h\n"
+ "fmla z13.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x15, LSL #1]\n"
"ldr x23, [x8, #0xb8]\n"
- "fmla z10.h, p3/M, z8.h, z21.h\n"
- "fmla z26.h, p3/M, z5.h, z21.h\n"
- "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z17.h\n"
+ "fmla z25.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x22, [x8, #0xc0]\n"
- "fmla z18.h, p3/M, z5.h, z29.h\n"
- "fmla z27.h, p3/M, z2.h, z29.h\n"
- "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z22.h\n"
+ "fmla z31.h, p3/M, z2.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x28, x15, LSL #1]\n"
"ldr x21, [x8, #0xc8]\n"
- "fmla z13.h, p3/M, z5.h, z17.h\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "fmla z14.h, p3/M, z2.h, z17.h\n"
- "fmla z9.h, p3/M, z3.h, z17.h\n"
- "fmla z15.h, p3/M, z1.h, z17.h\n"
- "fmla z11.h, p3/M, z0.h, z17.h\n"
- "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z5.h, z21.h\n"
+ "fmla z26.h, p3/M, z4.h, z21.h\n"
+ "fmla z16.h, p3/M, z2.h, z21.h\n"
+ "fmla z29.h, p3/M, z3.h, z21.h\n"
+ "fmla z13.h, p3/M, z1.h, z21.h\n"
+ "fmla z11.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x20, x15, LSL #1]\n"
"ldr x28, [x8, #0xd8]\n"
- "fmla z28.h, p3/M, z7.h, z23.h\n"
- "fmla z25.h, p3/M, z6.h, z23.h\n"
- "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z9.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x15, LSL #1]\n"
"ldr x20, [x8, #0xd0]\n"
- "fmla z18.h, p3/M, z7.h, z21.h\n"
- "fmla z13.h, p3/M, z6.h, z21.h\n"
- "fmla z27.h, p3/M, z4.h, z21.h\n"
- "fmla z14.h, p3/M, z3.h, z21.h\n"
- "fmla z31.h, p3/M, z1.h, z21.h\n"
- "fmla z30.h, p3/M, z0.h, z21.h\n"
- "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z22.h\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "fmla z16.h, p3/M, z3.h, z22.h\n"
+ "fmla z28.h, p3/M, z1.h, z22.h\n"
+ "fmla z15.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x26, x15, LSL #1]\n"
"ldr x27, [x8, #0xe0]\n"
- "fmla z22.h, p3/M, z8.h, z29.h\n"
- "fmla z24.h, p3/M, z8.h, z23.h\n"
- "fmla z26.h, p3/M, z7.h, z23.h\n"
- "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
- "fmla z10.h, p3/M, z1.h, z29.h\n"
+ "fmla z26.h, p3/M, z8.h, z21.h\n"
+ "fmla z12.h, p3/M, z8.h, z17.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z1.h, z21.h\n"
"ldr x26, [x8, #0xe8]\n"
- "fmla z9.h, p3/M, z7.h, z29.h\n"
- "fmla z15.h, p3/M, z5.h, z29.h\n"
- "fmla z11.h, p3/M, z4.h, z29.h\n"
- "fmla z20.h, p3/M, z2.h, z29.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z29.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z20.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x15, LSL #1]\n"
"ldr x25, [x8, #0xf0]\n"
- "fmla z18.h, p3/M, z2.h, z21.h\n"
- "fmla z13.h, p3/M, z1.h, z21.h\n"
- "fmla z22.h, p3/M, z0.h, z21.h\n"
- "fmla z27.h, p3/M, z7.h, z23.h\n"
- "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z22.h\n"
+ "fmla z18.h, p3/M, z1.h, z22.h\n"
+ "fmla z26.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z7.h, z17.h\n"
+ "ld1h { z22.h }, p2/Z, [x23, x15, LSL #1]\n"
"ldr x24, [x8, #0xf8]\n"
- "fmla z14.h, p3/M, z6.h, z23.h\n"
- "fmla z31.h, p3/M, z4.h, z23.h\n"
- "fmla z30.h, p3/M, z3.h, z23.h\n"
- "fmla z28.h, p3/M, z1.h, z23.h\n"
- "fmla z25.h, p3/M, z0.h, z23.h\n"
- "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
- "fmla z10.h, p3/M, z4.h, z17.h\n"
+ "fmla z16.h, p3/M, z6.h, z17.h\n"
+ "fmla z28.h, p3/M, z4.h, z17.h\n"
+ "fmla z15.h, p3/M, z3.h, z17.h\n"
+ "fmla z9.h, p3/M, z1.h, z17.h\n"
+ "fmla z24.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z21.h\n"
"ldr x23, [x8, #0x100]\n"
- "fmla z24.h, p3/M, z2.h, z17.h\n"
- "fmla z13.h, p3/M, z2.h, z29.h\n"
- "fmla z22.h, p3/M, z1.h, z29.h\n"
- "fmla z9.h, p3/M, z0.h, z29.h\n"
- "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z21.h\n"
+ "fmla z29.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x21, x15, LSL #1]\n"
"ldr x22, [x8, #0x108]\n"
- "fmla z18.h, p3/M, z6.h, z21.h\n"
- "fmla z27.h, p3/M, z3.h, z21.h\n"
- "fmla z31.h, p3/M, z0.h, z21.h\n"
- "fmla z15.h, p3/M, z8.h, z17.h\n"
- "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z22.h\n"
+ "fmla z31.h, p3/M, z3.h, z22.h\n"
+ "fmla z23.h, p3/M, z4.h, z17.h\n"
+ "fmla z12.h, p3/M, z2.h, z17.h\n"
+ "fmla z28.h, p3/M, z0.h, z22.h\n"
+ "fmla z13.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z22.h }, p2/Z, [x20, x15, LSL #1]\n"
"ldr x21, [x8, #0x110]\n"
"fmla z11.h, p3/M, z7.h, z17.h\n"
"fmla z20.h, p3/M, z5.h, z17.h\n"
- "fmla z26.h, p3/M, z1.h, z17.h\n"
- "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
- "fmla z10.h, p3/M, z2.h, z23.h\n"
+ "fmla z25.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z21.h\n"
"ldr x20, [x8, #0x118]\n"
- "fmla z28.h, p3/M, z0.h, z29.h\n"
- "fmla z25.h, p3/M, z4.h, z21.h\n"
- "fmla z24.h, p3/M, z3.h, z21.h\n"
- "fmla z9.h, p3/M, z8.h, z23.h\n"
- "fmla z11.h, p3/M, z5.h, z23.h\n"
- "fmla z27.h, p3/M, z6.h, z29.h\n"
- "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
- "fmla z31.h, p3/M, z3.h, z29.h\n"
- "ld1h { z17.h }, p2/Z, [x26, x16, LSL #1]\n"
- "fmla z30.h, p3/M, z7.h, z21.h\n"
- "fmla z20.h, p3/M, z6.h, z21.h\n"
- "fmla z28.h, p3/M, z5.h, z21.h\n"
- "fmla z10.h, p3/M, z5.h, z23.h\n"
- "fmla z26.h, p3/M, z2.h, z23.h\n"
- "fmla z25.h, p3/M, z7.h, z17.h\n"
- "fmla z24.h, p3/M, z6.h, z17.h\n"
- "fmla z31.h, p3/M, z8.h, z21.h\n"
- "ld1h { z21.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z21.h\n"
+ "fmla z9.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z6.h, z22.h\n"
+ "fmla z28.h, p3/M, z3.h, z22.h\n"
+ "ld1h { z27.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z17.h\n"
+ "fmla z12.h, p3/M, z3.h, z17.h\n"
+ "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z15.h, p3/M, z7.h, z17.h\n"
+ "fmla z20.h, p3/M, z6.h, z17.h\n"
+ "fmla z9.h, p3/M, z5.h, z17.h\n"
"fmla z28.h, p3/M, z8.h, z17.h\n"
- "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
- "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z22.h\n"
+ "fmla z25.h, p3/M, z2.h, z22.h\n"
+ "fmla z24.h, p3/M, z7.h, z27.h\n"
+ "fmla z12.h, p3/M, z6.h, z27.h\n"
+ "fmla z11.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z9.h, p3/M, z8.h, z27.h\n"
+ "fmla z15.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z27.h }, p2/Z, [x23, x15, LSL #1]\n"
"fmla z20.h, p3/M, z7.h, z21.h\n"
- "fmla z10.h, p3/M, z6.h, z21.h\n"
- "fmla z25.h, p3/M, z5.h, z21.h\n"
- "fmla z24.h, p3/M, z4.h, z21.h\n"
- "fmla z26.h, p3/M, z3.h, z21.h\n"
- "ld1h { z21.h }, p2/Z, [x22, x16, LSL #1]\n"
- "fmla z11.h, p3/M, z8.h, z23.h\n"
- "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
- "fmla z18.h, p3/M, z4.h, z12.h\n"
- "fmax z18.h, p3/M, z18.h, z16.h\n"
- "fmla z13.h, p3/M, z3.h, z12.h\n"
- "fmla z22.h, p3/M, z5.h, z21.h\n"
- "fmax z13.h, p3/M, z13.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z16.h\n"
- "fmla z9.h, p3/M, z4.h, z21.h\n"
- "fmla z25.h, p3/M, z8.h, z29.h\n"
- "fmax z9.h, p3/M, z9.h, z16.h\n"
- "fmin z18.h, p3/M, z18.h, z19.h\n"
- "fmla z24.h, p3/M, z7.h, z29.h\n"
- "fmla z26.h, p3/M, z6.h, z29.h\n"
- "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
- "fmin z13.h, p3/M, z13.h, z19.h\n"
- "fmla z27.h, p3/M, z1.h, z12.h\n"
- "fmla z14.h, p3/M, z0.h, z12.h\n"
- "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
- "fmin z22.h, p3/M, z22.h, z19.h\n"
- "fmla z15.h, p3/M, z2.h, z21.h\n"
+ "fmla z23.h, p3/M, z6.h, z21.h\n"
+ "fmla z24.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z3.h, z21.h\n"
+ "fmla z12.h, p3/M, z4.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z27.h\n"
+ "fmla z18.h, p3/M, z3.h, z27.h\n"
+ "fmla z31.h, p3/M, z1.h, z27.h\n"
+ "fmla z16.h, p3/M, z0.h, z27.h\n"
+ "ld1h { z27.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "fmla z29.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "fmla z12.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmax z30.h, p3/M, z30.h, z19.h\n"
+ "fmla z13.h, p3/M, z2.h, z21.h\n"
"fmla z11.h, p3/M, z1.h, z21.h\n"
- "fmin z9.h, p3/M, z9.h, z19.h\n"
- "fmax z27.h, p3/M, z27.h, z16.h\n"
- "fmla z31.h, p3/M, z7.h, z23.h\n"
- "fmla z30.h, p3/M, z6.h, z23.h\n"
- "fmax z14.h, p3/M, z14.h, z16.h\n"
- "fmax z15.h, p3/M, z15.h, z16.h\n"
- "fmla z20.h, p3/M, z8.h, z29.h\n"
- "fmla z10.h, p3/M, z7.h, z29.h\n"
- "fmax z11.h, p3/M, z11.h, z16.h\n"
- "st1h { z18.h }, p0, [x12, x15, LSL #1]\n"
- "st1h { z13.h }, p0, [x11, x15, LSL #1]\n"
- "ldr x23, [x14, #0x20]\n"
- "ldr x22, [x14, #0x28]\n"
- "fmla z28.h, p3/M, z4.h, z23.h\n"
- "st1h { z22.h }, p0, [x10, x15, LSL #1]\n"
- "ldr x21, [x14, #0x30]\n"
- "fmla z25.h, p3/M, z3.h, z23.h\n"
- "fmla z24.h, p3/M, z5.h, z29.h\n"
- "st1h { z9.h }, p0, [x9, x15, LSL #1]\n"
- "ldr x20, [x14, #0x38]\n"
- "fmla z26.h, p3/M, z4.h, z29.h\n"
- "fmin z27.h, p3/M, z27.h, z19.h\n"
- "fmin z14.h, p3/M, z14.h, z19.h\n"
- "fmin z15.h, p3/M, z15.h, z19.h\n"
- "st1h { z27.h }, p0, [x23, x15, LSL #1]\n"
- "ldr x23, [x14, #0x40]\n"
- "fmin z11.h, p3/M, z11.h, z19.h\n"
- "fmax z31.h, p3/M, z31.h, z16.h\n"
- "st1h { z14.h }, p0, [x22, x15, LSL #1]\n"
- "ldr x22, [x14, #0x48]\n"
- "fmax z30.h, p3/M, z30.h, z16.h\n"
- "fmax z20.h, p3/M, z20.h, z16.h\n"
- "st1h { z15.h }, p0, [x21, x15, LSL #1]\n"
- "ldr x21, [x14, #0x50]\n"
- "fmax z10.h, p3/M, z10.h, z16.h\n"
- "st1h { z11.h }, p0, [x20, x15, LSL #1]\n"
- "ldr x20, [x14, #0x58]\n"
- "fmin z31.h, p3/M, z31.h, z19.h\n"
- "fmin z30.h, p3/M, z30.h, z19.h\n"
- "fmin z20.h, p3/M, z20.h, z19.h\n"
- "st1h { z31.h }, p0, [x23, x15, LSL #1]\n"
- "ldr x23, [x14, #0x60]\n"
- "fmin z10.h, p3/M, z10.h, z19.h\n"
- "fmax z28.h, p3/M, z28.h, z16.h\n"
- "st1h { z30.h }, p0, [x22, x15, LSL #1]\n"
- "ldr x22, [x14, #0x68]\n"
- "fmax z25.h, p3/M, z25.h, z16.h\n"
- "fmax z24.h, p3/M, z24.h, z16.h\n"
- "st1h { z20.h }, p0, [x21, x15, LSL #1]\n"
- "ldr x21, [x14, #0x70]\n"
- "fmax z26.h, p3/M, z26.h, z16.h\n"
- "st1h { z10.h }, p0, [x20, x15, LSL #1]\n"
- "ldr x20, [x14, #0x78]\n"
- "fmin z28.h, p3/M, z28.h, z19.h\n"
- "fmin z25.h, p3/M, z25.h, z19.h\n"
- "fmin z24.h, p3/M, z24.h, z19.h\n"
- "st1h { z28.h }, p0, [x23, x15, LSL #1]\n"
- "fmin z26.h, p3/M, z26.h, z19.h\n"
- "st1h { z25.h }, p0, [x22, x15, LSL #1]\n"
- "st1h { z24.h }, p0, [x21, x15, LSL #1]\n"
- "st1h { z26.h }, p0, [x20, x15, LSL #1]\n"
+ "fmax z18.h, p3/M, z18.h, z19.h\n"
+ "fmax z31.h, p3/M, z31.h, z19.h\n"
+ "fmax z26.h, p3/M, z26.h, z19.h\n"
+ "fmla z20.h, p3/M, z8.h, z27.h\n"
+ "fmla z23.h, p3/M, z7.h, z27.h\n"
+ "fmax z16.h, p3/M, z16.h, z19.h\n"
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "fmla z28.h, p3/M, z7.h, z10.h\n"
+ "fmla z15.h, p3/M, z6.h, z10.h\n"
+ "fmin z30.h, p3/M, z30.h, z14.h\n"
+ "fmin z18.h, p3/M, z18.h, z14.h\n"
+ "fmla z9.h, p3/M, z4.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmin z31.h, p3/M, z31.h, z14.h\n"
+ "fmin z26.h, p3/M, z26.h, z14.h\n"
+ "fmax z13.h, p3/M, z13.h, z19.h\n"
+ "fmla z12.h, p3/M, z5.h, z27.h\n"
+ "fmla z25.h, p3/M, z4.h, z27.h\n"
+ "fmin z29.h, p3/M, z29.h, z14.h\n"
+ "fmax z11.h, p3/M, z11.h, z19.h\n"
+ "st1h { z30.h }, p0, [x12, x14, LSL #1]\n"
+ "ldr x23, [x17, #0x20]\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "ldr x22, [x17, #0x28]\n"
+ "fmin z16.h, p3/M, z16.h, z14.h\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z26.h }, p0, [x10, x14, LSL #1]\n"
+ "ldr x21, [x17, #0x30]\n"
+ "fmin z13.h, p3/M, z13.h, z14.h\n"
+ "fmax z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z29.h }, p0, [x9, x14, LSL #1]\n"
+ "ldr x20, [x17, #0x38]\n"
+ "fmin z11.h, p3/M, z11.h, z14.h\n"
+ "fmax z20.h, p3/M, z20.h, z19.h\n"
+ "fmax z23.h, p3/M, z23.h, z19.h\n"
+ "st1h { z31.h }, p0, [x23, x14, LSL #1]\n"
+ "ldr x23, [x17, #0x40]\n"
+ "fmin z28.h, p3/M, z28.h, z14.h\n"
+ "st1h { z16.h }, p0, [x22, x14, LSL #1]\n"
+ "ldr x22, [x17, #0x48]\n"
+ "fmin z15.h, p3/M, z15.h, z14.h\n"
+ "fmax z9.h, p3/M, z9.h, z19.h\n"
+ "st1h { z13.h }, p0, [x21, x14, LSL #1]\n"
+ "ldr x21, [x17, #0x50]\n"
+ "fmin z20.h, p3/M, z20.h, z14.h\n"
+ "fmax z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z11.h }, p0, [x20, x14, LSL #1]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmin z23.h, p3/M, z23.h, z14.h\n"
+ "fmax z12.h, p3/M, z12.h, z19.h\n"
+ "fmax z25.h, p3/M, z25.h, z19.h\n"
+ "st1h { z28.h }, p0, [x23, x14, LSL #1]\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmin z9.h, p3/M, z9.h, z14.h\n"
+ "st1h { z15.h }, p0, [x22, x14, LSL #1]\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmin z24.h, p3/M, z24.h, z14.h\n"
+ "st1h { z20.h }, p0, [x21, x14, LSL #1]\n"
+ "ldr x21, [x17, #0x70]\n"
+ "fmin z12.h, p3/M, z12.h, z14.h\n"
+ "st1h { z23.h }, p0, [x20, x14, LSL #1]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmin z25.h, p3/M, z25.h, z14.h\n"
+ "st1h { z9.h }, p0, [x23, x14, LSL #1]\n"
+ "st1h { z24.h }, p0, [x22, x14, LSL #1]\n"
+ "st1h { z12.h }, p0, [x21, x14, LSL #1]\n"
+ "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
: "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index 58decdba1c..187c11aa3a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,246 +88,246 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x11, #0x0\n"
- "mov x16, #0x0\n"
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
"1:" // Tile loop
- "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x25, #0x4\n"
- "mov x24, #0x2\n"
- "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
- "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cnth x13\n"
- "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
- "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x10, x15, x15\n"
- "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
- "add x12, x12, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x28, x12, x23, LSL #1\n"
- "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cnth x16\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z30.h }, p3/Z, [x11]\n"
- "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
- "add x27, x28, x23, LSL #1\n"
- "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
- "add x26, x10, x15\n"
- "add x25, x27, x23, LSL #1\n"
- "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
- "add x24, x26, x15\n"
- "add x9, x9, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "cmp x13, %x[n_channels]\n"
- "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x23, x25, x23, LSL #1\n"
- "add x22, x9, x21, LSL #1\n"
- "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x13\n"
- "ld1h { z9.h }, p2/Z, [x27, x10, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x12]\n"
- "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x12, x26, LSL #1]\n"
- "addvl x11, x11, #-6\n"
- "ld1h { z13.h }, p2/Z, [x12, x24, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x28]\n"
- "ld1h { z15.h }, p2/Z, [x28, x15, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x22, x7, x24\n" // offset = tile_i * ld_input_row
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x17, x17\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x21, x7, x23\n" // offset = tile_i * ld_output_row
+ "add x9, x10, x17\n"
+ "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x20, XZR, x16\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ld1h { z28.h }, p3/Z, [x12]\n"
+ "ld1h { z0.h }, p3/Z, [x12, #1, MUL VL]\n"
+ "add x28, x9, x17\n"
+ "ld1h { z1.h }, p3/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x12, #3, MUL VL]\n"
+ "madd x21, x8, x15, x21\n" // offset += tile_j * ld_output_col
+ "ld1h { z3.h }, p3/Z, [x12, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x12, #5, MUL VL]\n"
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "ld1h { z5.h }, p3/Z, [x12, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #16\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "add x13, x13, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x27, x13, x24, LSL #1\n"
+ "add x26, x27, x24, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x13]\n"
+ "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "add x25, x26, x24, LSL #1\n"
+ "add x11, x11, x21, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x25, x24, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x12, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x12, #-7, MUL VL]\n"
+ "add x23, x11, x23, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x26, x10, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x13, x9, LSL #1]\n"
+ "addvl x12, x12, #-6\n"
+ "ld1h { z13.h }, p2/Z, [x13, x28, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x27]\n"
+ "ld1h { z15.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x13, x10, LSL #1]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
- "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
- "whilelt p1.h, x13, %x[n_channels]\n"
- "inch x21\n"
+ "movprfx z27, z28\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z28\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "inch x14\n"
+ "movprfx z25, z28\n fmla z25.h, p3/M, z2.h, z9.h\n"
+ "movprfx z24, z28\n fmla z24.h, p3/M, z0.h, z9.h\n"
+ "inch x16\n"
+ "mov p0.b, p2.b\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z28.h }, p3/Z, [x12]\n"
+ "inch x20\n"
"fmla z27.h, p3/M, z0.h, z10.h\n"
"fmla z26.h, p3/M, z1.h, z12.h\n"
- "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
- "inch x13\n"
+ "ld1h { z21.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x13]\n"
"fmla z27.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x27, x10, LSL #1]\n"
+ "addvl x27, x27, #1\n"
"fmla z27.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z17.h }, p2/Z, [x25]\n"
"fmla z26.h, p3/M, z0.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x25]\n"
- "mov p0.b, p2.b\n"
+ "fmla z25.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x28, LSL #1]\n"
"fmla z27.h, p3/M, z4.h, z15.h\n"
- "fmla z26.h, p3/M, z4.h, z17.h\n"
- "ld1h { z25.h }, p2/Z, [x27]\n"
- "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x26]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x17, LSL #1]\n"
"fmla z27.h, p3/M, z2.h, z16.h\n"
- "fmla z26.h, p3/M, z5.h, z20.h\n"
- "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
- "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
- "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
- "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "addvl x12, x12, #1\n"
- "addvl x28, x28, #1\n"
- "fmla z27.h, p3/M, z5.h, z19.h\n"
- "fmla z26.h, p3/M, z3.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x11]\n"
- "fmla z22.h, p3/M, z3.h, z18.h\n"
- "fmla z21.h, p3/M, z4.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
- "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z22.h, p3/M, z0.h, z25.h\n"
- "fmla z21.h, p3/M, z1.h, z24.h\n"
- "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
- "inch x20\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "fmla z21.h, p3/M, z5.h, z16.h\n"
- "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z27.h, p3/M, z6.h, z25.h\n"
- "fmla z22.h, p3/M, z1.h, z23.h\n"
- "ld1h { z17.h }, p2/Z, [x23]\n"
- "addvl x27, x27, #1\n"
- "fmla z21.h, p3/M, z2.h, z19.h\n"
- "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z0.h }, p3/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z20.h\n"
+ "fmla z26.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x26, x28, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "ld1h { z4.h }, p3/Z, [x12, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x24]\n"
+ "fmla z26.h, p3/M, z7.h, z18.h\n"
+ "fmla z24.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z12.h }, p1/Z, [x13, x9, LSL #1]\n"
+ "fmla z25.h, p3/M, z1.h, z19.h\n"
+ "ld1h { z1.h }, p3/Z, [x12, #2, MUL VL]\n"
+ "ld1h { z9.h }, p1/Z, [x26, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
"ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z29.h\n"
- "fmla z22.h, p3/M, z6.h, z17.h\n"
- "fmla z21.h, p3/M, z3.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
- "fmla z22.h, p3/M, z7.h, z20.h\n"
- "fmla z21.h, p3/M, z7.h, z18.h\n"
- "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
- "fmla z26.h, p3/M, z7.h, z24.h\n"
- "fmla z22.h, p3/M, z5.h, z16.h\n"
- "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
- "fmla z21.h, p3/M, z6.h, z17.h\n"
- "fmla z26.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z29.h\n"
- "fmla z22.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z29.h\n"
- "fmax z21.h, p3/M, z21.h, z29.h\n"
- "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
- "cmp x13, %x[n_channels]\n"
- "fmin z27.h, p3/M, z27.h, z28.h\n"
- "ld1h { z10.h }, p1/Z, [x12]\n"
- "ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
- "fmin z26.h, p3/M, z26.h, z28.h\n"
- "fmin z22.h, p3/M, z22.h, z28.h\n"
- "ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
- "fmin z21.h, p3/M, z21.h, z28.h\n"
"addvl x25, x25, #1\n"
- "ld1h { z14.h }, p1/Z, [x28]\n"
- "ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z23.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z20.h\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z30.h\n"
+ "fmla z24.h, p3/M, z2.h, z20.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x28, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x12, #3, MUL VL]\n"
+ "whilelt p2.h, x14, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "addvl x24, x24, #1\n"
+ "fmin z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z25.h, p3/M, z7.h, z21.h\n"
+ "ld1h { z13.h }, p1/Z, [x13, x28, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z30.h\n"
+ "fmla z24.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z3.h }, p3/Z, [x12, #4, MUL VL]\n"
+ "fmin z26.h, p3/M, z26.h, z29.h\n"
+ "st1h { z27.h }, p0, [x11]\n"
+ "fmla z25.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z5.h }, p3/Z, [x12, #6, MUL VL]\n"
+ "ld1h { z16.h }, p1/Z, [x13, x10, LSL #1]\n"
+ "st1h { z26.h }, p0, [x11, x15, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z14.h }, p1/Z, [x27]\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z6.h }, p3/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #16\n"
+ "ld1h { z15.h }, p1/Z, [x27, x17, LSL #1]\n"
+ "fmax z25.h, p3/M, z25.h, z30.h\n"
+ "ld1h { z7.h }, p3/Z, [x12, #-8, MUL VL]\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z11.h }, p1/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x12, #-7, MUL VL]\n"
+ "addvl x12, x12, #-6\n"
+ "fmin z25.h, p3/M, z25.h, z29.h\n"
+ "fmax z24.h, p3/M, z24.h, z30.h\n"
+ "st1h { z25.h }, p0, [x23]\n"
+ "fmin z24.h, p3/M, z24.h, z29.h\n"
+ "st1h { z24.h }, p0, [x23, x15, LSL #1]\n"
"addvl x23, x23, #1\n"
- "ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
- "st1h { z27.h }, p0, [x9]\n"
- "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
- "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
- "addvl x9, x9, #1\n"
- "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
- "addvl x11, x11, #-6\n"
- "st1h { z22.h }, p0, [x22]\n"
- "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
- "addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
- "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z28\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z28\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z25, z28\n fmla z25.h, p3/M, z2.h, z9.h\n"
+ "movprfx z24, z28\n fmla z24.h, p3/M, z0.h, z9.h\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "mov p0.b, p2.b\n"
+ "add x8, x8, #0x1\n"
+ "add x20, x7, #0x1\n"
"fmla z27.h, p3/M, z0.h, z10.h\n"
"fmla z26.h, p3/M, z1.h, z12.h\n"
- "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ld1h { z21.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "cmp x8, x22\n"
+ "csel x7, x7, x20, LT\n"
+ "csel x8, x8, XZR, LT\n"
"fmla z27.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
"fmla z26.h, p3/M, z2.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x27, x10, LSL #1]\n"
+ "cmp x7, x21\n"
"fmla z27.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z17.h }, p2/Z, [x25]\n"
"fmla z26.h, p3/M, z0.h, z16.h\n"
- "ld1h { z18.h }, p2/Z, [x25]\n"
- "add x16, x16, #0x1\n"
+ "fmla z25.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x28, LSL #1]\n"
"fmla z27.h, p3/M, z4.h, z15.h\n"
- "fmla z26.h, p3/M, z4.h, z17.h\n"
- "ld1h { z25.h }, p2/Z, [x27]\n"
- "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x26]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z0.h, z22.h\n"
"fmla z27.h, p3/M, z2.h, z16.h\n"
- "fmla z26.h, p3/M, z5.h, z20.h\n"
- "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
- "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
- "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
- "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "cmp x16, x20\n"
- "add x21, x11, #0x1\n"
- "fmla z27.h, p3/M, z5.h, z19.h\n"
- "fmla z26.h, p3/M, z3.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z22.h, p3/M, z3.h, z18.h\n"
- "fmla z21.h, p3/M, z4.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
- "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z22.h, p3/M, z0.h, z25.h\n"
- "fmla z21.h, p3/M, z1.h, z24.h\n"
- "csel x11, x11, x21, LT\n"
- "mov p0.b, p2.b\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "fmla z21.h, p3/M, z5.h, z16.h\n"
- "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
- "fmla z27.h, p3/M, z6.h, z25.h\n"
- "fmla z22.h, p3/M, z1.h, z23.h\n"
- "ld1h { z17.h }, p2/Z, [x23]\n"
- "csel x16, x16, XZR, LT\n"
- "fmla z21.h, p3/M, z2.h, z19.h\n"
- "fmla z27.h, p3/M, z7.h, z23.h\n"
- "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
- "fmax z27.h, p3/M, z27.h, z29.h\n"
- "fmla z22.h, p3/M, z6.h, z17.h\n"
- "fmla z21.h, p3/M, z3.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
- "cmp x11, x20\n"
- "fmla z22.h, p3/M, z7.h, z20.h\n"
- "fmla z21.h, p3/M, z7.h, z18.h\n"
- "fmin z27.h, p3/M, z27.h, z28.h\n"
- "st1h { z27.h }, p0, [x9]\n"
- "fmla z26.h, p3/M, z7.h, z24.h\n"
- "fmla z22.h, p3/M, z5.h, z16.h\n"
- "fmla z21.h, p3/M, z6.h, z17.h\n"
- "fmla z26.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
- "fmax z26.h, p3/M, z26.h, z29.h\n"
- "fmla z22.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z16.h\n"
- "fmax z22.h, p3/M, z22.h, z29.h\n"
- "fmax z21.h, p3/M, z21.h, z29.h\n"
- "fmin z26.h, p3/M, z26.h, z28.h\n"
- "fmin z22.h, p3/M, z22.h, z28.h\n"
- "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
- "fmin z21.h, p3/M, z21.h, z28.h\n"
- "st1h { z22.h }, p0, [x22]\n"
- "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x26, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z20.h\n"
+ "fmla z25.h, p3/M, z4.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x24]\n"
+ "fmla z25.h, p3/M, z1.h, z18.h\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z27.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25, x10, LSL #1]\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z21.h\n"
+ "fmla z24.h, p3/M, z5.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z30.h\n"
+ "fmla z25.h, p3/M, z7.h, z20.h\n"
+ "fmax z26.h, p3/M, z26.h, z30.h\n"
+ "fmin z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z24.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x28, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmin z26.h, p3/M, z26.h, z29.h\n"
+ "st1h { z27.h }, p0, [x11]\n"
+ "fmla z24.h, p3/M, z3.h, z19.h\n"
+ "st1h { z26.h }, p0, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "fmax z25.h, p3/M, z25.h, z30.h\n"
+ "fmin z25.h, p3/M, z25.h, z29.h\n"
+ "st1h { z25.h }, p0, [x23]\n"
+ "fmla z24.h, p3/M, z8.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z30.h\n"
+ "fmin z24.h, p3/M, z24.h, z29.h\n"
+ "st1h { z24.h }, p0, [x23, x15, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index d5fbb6baee..a4ba50b9bb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,245 +89,245 @@ void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "cnth x14\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x15\n"
+ "mov x14, #0x0\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
"ldp x13, x12, [x20, #0x0]\n"
"ldp x11, x10, [x20, #0x10]\n"
- "mov x9, #0x0\n"
- "whilelt p2.h, XZR, %x[n_channels]\n"
- "ld1h { z20.h }, p3/Z, [x16]\n"
- "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
- "sub x28, XZR, x14\n"
- "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
- "addvl x16, x16, #16\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ld1h { z15.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x17]\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "sub x9, XZR, x15\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z15.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
- "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z12.h\n"
- "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x20, [x15, #0x50]\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z2.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z14.h\n"
- "fmla z23.h, p3/M, z0.h, z16.h\n"
- "ldr x20, [x15, #0x58]\n"
- "ldr x22, [x15, #0x78]\n"
- "fmla z24.h, p3/M, z4.h, z15.h\n"
- "fmla z23.h, p3/M, z4.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x60]\n"
- "fmla z24.h, p3/M, z2.h, z16.h\n"
- "fmla z23.h, p3/M, z5.h, z18.h\n"
- "ldr x20, [x15, #0x80]\n"
- "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
- "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
- "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x68]\n"
- "fmla z24.h, p3/M, z5.h, z19.h\n"
- "fmla z23.h, p3/M, z3.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x20, [x15, #0x88]\n"
- "fmla z22.h, p3/M, z3.h, z17.h\n"
- "fmla z21.h, p3/M, z4.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z0.h, z18.h\n"
- "fmla z21.h, p3/M, z1.h, z20.h\n"
- "ldr x21, [x15, #0x70]\n"
- "ldr x20, [x15, #0x98]\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "fmla z21.h, p3/M, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z24.h, p3/M, z6.h, z18.h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "fmla z22.h, p3/M, z1.h, z16.h\n"
- "fmla z21.h, p3/M, z2.h, z19.h\n"
- "fmla z24.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0xa0]\n"
- "ldr x20, [x15, #0xb0]\n"
- "fmla z22.h, p3/M, z6.h, z16.h\n"
- "fmla z21.h, p3/M, z3.h, z18.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z17.h\n"
- "fmla z21.h, p3/M, z7.h, z16.h\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla z23.h, p3/M, z7.h, z20.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z5.h, z18.h\n"
- "ldr x20, [x15, #0xc0]\n"
- "fmla z21.h, p3/M, z6.h, z17.h\n"
- "fmla z23.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z16.h\n"
- "whilelt p1.h, x14, %x[n_channels]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ldr x28, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "movprfx z25, z30\n fmla z25.h, p3/M, z2.h, z9.h\n"
+ "movprfx z24, z30\n fmla z24.h, p3/M, z0.h, z9.h\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "whilelt p1.h, x15, %x[n_channels]\n"
"inch x9\n"
- "fmax z24.h, p3/M, z24.h, z26.h\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
- "fmax z23.h, p3/M, z23.h, z26.h\n"
- "fmax z22.h, p3/M, z22.h, z26.h\n"
- "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
- "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
- "fmax z21.h, p3/M, z21.h, z26.h\n"
- "inch x28\n"
- "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "ldr x26, [x16, #0x70]\n"
"mov p0.b, p2.b\n"
- "whilelt p2.h, x9, %x[n_channels]\n"
- "ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
- "ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
- "fmin z24.h, p3/M, z24.h, z25.h\n"
- "fmin z23.h, p3/M, z23.h, z25.h\n"
- "ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z21.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldr x22, [x16, #0x88]\n"
+ "ld1h { z30.h }, p3/Z, [x17]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "fmla z25.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z23.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z25.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z5.h, z20.h\n"
+ "fmla z26.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z24.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z1.h, z19.h\n"
+ "fmla z24.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "fmla z26.h, p3/M, z7.h, z18.h\n"
+ "fmla z27.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z23.h\n"
+ "ld1h { z19.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z21.h\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmla z24.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
"inch x14\n"
- "ld1h { z20.h }, p3/Z, [x16]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
- "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
- "fmin z22.h, p3/M, z22.h, z25.h\n"
- "fmin z21.h, p3/M, z21.h, z25.h\n"
- "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
- "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
- "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
- "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
- "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
- "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
- "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
- "addvl x16, x16, #16\n"
- "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
- "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
- "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
- "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "fmla z25.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmla z24.h, p3/M, z3.h, z16.h\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z9.h }, p1/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x26, x15, LSL #1]\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "whilelt p2.h, x14, %x[n_channels]\n"
+ "ld1h { z12.h }, p1/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x23, x15, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "st1h { z27.h }, p0, [x13, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z14.h }, p1/Z, [x22, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmla z25.h, p3/M, z8.h, z18.h\n"
+ "fmla z24.h, p3/M, z6.h, z18.h\n"
+ "ld1h { z15.h }, p1/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "st1h { z26.h }, p0, [x12, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z11.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmax z25.h, p3/M, z25.h, z29.h\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "cmp x15, %x[n_channels]\n"
+ "fmin z25.h, p3/M, z25.h, z28.h\n"
+ "fmax z24.h, p3/M, z24.h, z29.h\n"
+ "fmin z24.h, p3/M, z24.h, z28.h\n"
+ "st1h { z25.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z24.h }, p0, [x10, x9, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
- "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "fmla z24.h, p3/M, z0.h, z10.h\n"
- "fmla z23.h, p3/M, z1.h, z12.h\n"
- "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x20, [x15, #0x50]\n"
- "fmla z24.h, p3/M, z1.h, z11.h\n"
- "fmla z23.h, p3/M, z2.h, z13.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z24.h, p3/M, z3.h, z14.h\n"
- "fmla z23.h, p3/M, z0.h, z16.h\n"
- "ldr x20, [x15, #0x58]\n"
- "ldr x22, [x15, #0x78]\n"
- "fmla z24.h, p3/M, z4.h, z15.h\n"
- "fmla z23.h, p3/M, z4.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x60]\n"
- "fmla z24.h, p3/M, z2.h, z16.h\n"
- "fmla z23.h, p3/M, z5.h, z18.h\n"
- "ldr x20, [x15, #0x80]\n"
- "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
- "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
- "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
- "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0x68]\n"
- "fmla z24.h, p3/M, z5.h, z19.h\n"
- "fmla z23.h, p3/M, z3.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
- "ldr x20, [x15, #0x88]\n"
- "fmla z22.h, p3/M, z3.h, z17.h\n"
- "fmla z21.h, p3/M, z4.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z0.h, z18.h\n"
- "fmla z21.h, p3/M, z1.h, z20.h\n"
- "ldr x21, [x15, #0x70]\n"
- "ldr x20, [x15, #0x98]\n"
- "fmla z22.h, p3/M, z4.h, z17.h\n"
- "fmla z21.h, p3/M, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z24.h, p3/M, z6.h, z18.h\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "fmla z22.h, p3/M, z1.h, z16.h\n"
- "fmla z21.h, p3/M, z2.h, z19.h\n"
- "fmla z24.h, p3/M, z7.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
- "ldr x21, [x15, #0xa0]\n"
- "ldr x20, [x15, #0xb0]\n"
- "fmla z22.h, p3/M, z6.h, z16.h\n"
- "fmla z21.h, p3/M, z3.h, z18.h\n"
- "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z7.h, z17.h\n"
- "fmla z21.h, p3/M, z7.h, z16.h\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla z23.h, p3/M, z7.h, z20.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z5.h, z18.h\n"
- "ldr x20, [x15, #0xc0]\n"
- "fmla z21.h, p3/M, z6.h, z17.h\n"
- "fmla z23.h, p3/M, z8.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
- "fmla z22.h, p3/M, z8.h, z17.h\n"
- "fmla z21.h, p3/M, z8.h, z16.h\n"
- "inch x28\n"
+ "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+ "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ldr x28, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ldr x26, [x16, #0x50]\n"
+ "ldr x25, [x16, #0x58]\n"
+ "movprfx z25, z30\n fmla z25.h, p3/M, z2.h, z9.h\n"
+ "movprfx z24, z30\n fmla z24.h, p3/M, z0.h, z9.h\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "inch x9\n"
"mov p0.b, p2.b\n"
- "fmax z24.h, p3/M, z24.h, z26.h\n"
- "fmax z23.h, p3/M, z23.h, z26.h\n"
- "fmax z22.h, p3/M, z22.h, z26.h\n"
- "fmax z21.h, p3/M, z21.h, z26.h\n"
- "fmin z24.h, p3/M, z24.h, z25.h\n"
- "fmin z23.h, p3/M, z23.h, z25.h\n"
- "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
- "fmin z22.h, p3/M, z22.h, z25.h\n"
- "fmin z21.h, p3/M, z21.h, z25.h\n"
- "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
- "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "ldr x22, [x16, #0x70]\n"
+ "fmla z27.h, p3/M, z0.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z21.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ldr x21, [x16, #0x88]\n"
+ "fmla z27.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "ldr x26, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z3.h, z14.h\n"
+ "ld1h { z17.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "ldr x25, [x16, #0x98]\n"
+ "fmla z26.h, p3/M, z0.h, z16.h\n"
+ "fmla z27.h, p3/M, z4.h, z15.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z22.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z27.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z25.h, p3/M, z0.h, z23.h\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z27.h, p3/M, z5.h, z20.h\n"
+ "fmla z26.h, p3/M, z3.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, x14, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z24.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z4.h, z19.h\n"
+ "ld1h { z20.h }, p2/Z, [x25, x14, LSL #1]\n"
+ "fmla z27.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z1.h, z17.h\n"
+ "fmla z25.h, p3/M, z1.h, z18.h\n"
+ "fmla z27.h, p3/M, z7.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x14, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z20.h\n"
+ "fmla z24.h, p3/M, z5.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z6.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x14, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z29.h\n"
+ "fmax z26.h, p3/M, z26.h, z29.h\n"
+ "fmla z24.h, p3/M, z2.h, z20.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z7.h, z21.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmla z24.h, p3/M, z3.h, z19.h\n"
+ "st1h { z27.h }, p0, [x13, x9, LSL #1]\n"
+ "st1h { z26.h }, p0, [x12, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z7.h, z18.h\n"
+ "fmla z25.h, p3/M, z5.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z25.h, p3/M, z8.h, z17.h\n"
+ "fmla z24.h, p3/M, z8.h, z16.h\n"
+ "fmax z25.h, p3/M, z25.h, z29.h\n"
+ "fmin z25.h, p3/M, z25.h, z28.h\n"
+ "st1h { z25.h }, p0, [x11, x9, LSL #1]\n"
+ "fmax z24.h, p3/M, z24.h, z29.h\n"
+ "fmin z24.h, p3/M, z24.h, z28.h\n"
+ "st1h { z24.h }, p0, [x10, x9, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index fdbee67926..5489cbd990 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,432 +88,432 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x12, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x6, #0x0\n"
+ "mov x7, #0x0\n"
"1:" // Tile loop
- "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
"mov x25, #0x2\n"
- "mov x24, #0x2\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cnth x17\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "add x15, x17, x17\n"
- "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "mov x15, #0x0\n"
"ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
"ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "cnth x12\n"
- "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
- "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
- "add x11, x14, x23, LSL #1\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
- "add x9, x11, x23, LSL #1\n"
- "add x28, x15, x17\n"
+ "mul x20, x6, x24\n" // offset = tile_i * ld_input_row
+ "add x12, x8, x8\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x12, x8\n"
+ "cmp x17, %x[n_channels]\n"
"ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "whilelt p2.h, XZR, %x[n_channels]\n"
- "add x27, x9, x23, LSL #1\n"
+ "mul x22, x6, x23\n" // offset = tile_i * ld_output_row
+ "add x9, x10, x8\n"
"ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x26, x28, x17\n"
- "add x25, x27, x23, LSL #1\n"
- "ld1h { z29.h }, p3/Z, [x10]\n"
- "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
- "add x24, x26, x17\n"
- "add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
- "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
- "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "add x23, x25, x23, LSL #1\n"
- "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
- "add x22, x13, x21, LSL #1\n"
- "mov x21, #0x0\n"
+ "sub x21, XZR, x17\n"
+ "madd x20, x7, x8, x20\n" // offset += tile_j * ld_input_col
+ "add x28, x9, x8\n"
+ "ld1h { z29.h }, p3/Z, [x11]\n"
+ "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "madd x22, x7, x16, x22\n" // offset += tile_j * ld_output_col
+ "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "addvl x11, x11, #6\n"
+ "mul x20, x20, x26\n" // offset *= kernel_stride * output_size
+ "mul x22, x22, x25\n" // offset *= output_tile_size
+ "add x14, x14, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x20, x14, x24, LSL #1\n"
+ "add x27, x20, x24, LSL #1\n"
"ld1h { z5.h }, p2/Z, [x14]\n"
- "ld1h { z6.h }, p2/Z, [x14, x17, LSL #1]\n"
- "sub x20, XZR, x12\n"
- "ld1h { z7.h }, p2/Z, [x11]\n"
- "ld1h { z8.h }, p2/Z, [x11, x17, LSL #1]\n"
- "addvl x10, x10, #6\n"
- "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
- "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
- "ld1h { z11.h }, p2/Z, [x14, x28, LSL #1]\n"
- "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
- "ld1h { z10.h }, p2/Z, [x11, x24, LSL #1]\n"
- "ld1h { z14.h }, p2/Z, [x9]\n"
+ "ld1h { z6.h }, p2/Z, [x14, x8, LSL #1]\n"
+ "add x26, x27, x24, LSL #1\n"
+ "add x25, x26, x24, LSL #1\n"
+ "ld1h { z7.h }, p2/Z, [x20]\n"
+ "ld1h { z8.h }, p2/Z, [x20, x8, LSL #1]\n"
+ "add x13, x13, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x25, x24, LSL #1\n"
+ "add x23, x13, x23, LSL #1\n"
+ "ld1h { z9.h }, p2/Z, [x14, x12, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x12, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x14, x10, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14, x9, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x27]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z5.h\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
"movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z24.h }, p2/Z, [x11, x28, LSL #1]\n"
- "whilelt p1.h, x12, %x[n_channels]\n"
- "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z7.h\n"
- "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z8.h\n"
- "ld1h { z18.h }, p3/Z, [x10]\n"
+ "ld1h { z25.h }, p2/Z, [x20, x10, LSL #1]\n"
+ "whilelt p1.h, x17, %x[n_channels]\n"
+ "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z7.h\n"
+ "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z23.h }, p3/Z, [x11]\n"
+ "inch x15\n"
+ "inch x17\n"
+ "mov p0.b, p2.b\n"
"inch x21\n"
- "fmla z27.h, p3/M, z1.h, z6.h\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "addvl x20, x20, #1\n"
"fmla z31.h, p3/M, z1.h, z9.h\n"
- "ld1h { z23.h }, p2/Z, [x11, x26, LSL #1]\n"
- "inch x12\n"
- "fmla z26.h, p3/M, z1.h, z8.h\n"
- "fmla z30.h, p3/M, z1.h, z13.h\n"
- "ld1h { z22.h }, p3/Z, [x10, #1, MUL VL]\n"
- "mov p0.b, p2.b\n"
- "fmla z27.h, p3/M, z2.h, z9.h\n"
- "fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z1.h, z8.h\n"
+ "fmla z26.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z21.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x14, x28, LSL #1]\n"
"addvl x14, x14, #1\n"
- "fmla z26.h, p3/M, z2.h, z13.h\n"
- "fmla z30.h, p3/M, z2.h, z24.h\n"
- "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
- "addvl x11, x11, #1\n"
- "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z13.h\n"
+ "fmla z26.h, p3/M, z2.h, z25.h\n"
+ "ld1h { z16.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z20.h }, p2/Z, [x27, x8, LSL #1]\n"
"fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z0.h }, p2/Z, [x9, x17, LSL #1]\n"
- "inch x20\n"
- "fmla z26.h, p3/M, z3.h, z24.h\n"
- "fmla z30.h, p3/M, z3.h, z23.h\n"
- "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z27.h, p3/M, z4.h, z12.h\n"
- "fmla z31.h, p3/M, z4.h, z16.h\n"
- "ld1h { z19.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ld1h { z5.h }, p2/Z, [x9, x28, LSL #1]\n"
- "fmla z26.h, p3/M, z4.h, z23.h\n"
- "fmla z30.h, p3/M, z4.h, z10.h\n"
- "ld1h { z21.h }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z27.h, p3/M, z18.h, z7.h\n"
- "fmla z31.h, p3/M, z18.h, z8.h\n"
- "ld1h { z7.h }, p1/Z, [x11]\n"
- "fmla z26.h, p3/M, z18.h, z14.h\n"
- "fmla z30.h, p3/M, z18.h, z0.h\n"
- "ld1h { z18.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z27.h, p3/M, z22.h, z8.h\n"
- "fmla z31.h, p3/M, z22.h, z13.h\n"
- "ld1h { z3.h }, p2/Z, [x9, x24, LSL #1]\n"
- "fmla z26.h, p3/M, z22.h, z0.h\n"
- "fmla z30.h, p3/M, z22.h, z19.h\n"
- "ld1h { z8.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z27.h, p3/M, z20.h, z13.h\n"
- "fmla z31.h, p3/M, z20.h, z24.h\n"
- "ld1h { z2.h }, p2/Z, [x9, x26, LSL #1]\n"
- "addvl x9, x9, #1\n"
- "fmla z26.h, p3/M, z20.h, z19.h\n"
- "fmla z30.h, p3/M, z20.h, z5.h\n"
- "ld1h { z16.h }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "fmla z27.h, p3/M, z17.h, z24.h\n"
- "fmla z31.h, p3/M, z17.h, z23.h\n"
- "ld1h { z25.h }, p2/Z, [x27]\n"
- "ld1h { z29.h }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z26.h, p3/M, z17.h, z5.h\n"
- "fmla z30.h, p3/M, z17.h, z2.h\n"
- "ld1h { z17.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z27.h, p3/M, z21.h, z23.h\n"
- "fmla z31.h, p3/M, z21.h, z10.h\n"
- "ld1h { z24.h }, p2/Z, [x27, x17, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z21.h, z2.h\n"
- "fmla z30.h, p3/M, z21.h, z3.h\n"
- "ld1h { z21.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z27.h, p3/M, z18.h, z14.h\n"
- "fmla z31.h, p3/M, z18.h, z0.h\n"
- "ld1h { z1.h }, p2/Z, [x27, x24, LSL #1]\n"
- "fmla z26.h, p3/M, z18.h, z25.h\n"
- "fmla z30.h, p3/M, z18.h, z24.h\n"
- "ld1h { z23.h }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z27.h, p3/M, z8.h, z0.h\n"
- "fmla z31.h, p3/M, z8.h, z19.h\n"
- "ld1h { z0.h }, p2/Z, [x27, x28, LSL #1]\n"
- "fmla z26.h, p3/M, z8.h, z24.h\n"
- "fmla z30.h, p3/M, z8.h, z22.h\n"
- "ld1h { z20.h }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z27.h, p3/M, z16.h, z19.h\n"
- "fmla z31.h, p3/M, z16.h, z5.h\n"
- "ld1h { z19.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z25.h\n"
+ "fmla z26.h, p3/M, z3.h, z22.h\n"
+ "ld1h { z17.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x12, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z22.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z0.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z7.h\n"
+ "ld1h { z7.h }, p1/Z, [x20]\n"
+ "fmla z31.h, p3/M, z23.h, z8.h\n"
+ "fmla z27.h, p3/M, z23.h, z14.h\n"
+ "fmla z26.h, p3/M, z23.h, z20.h\n"
+ "ld1h { z18.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z8.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z31.h, p3/M, z21.h, z13.h\n"
+ "fmla z27.h, p3/M, z21.h, z20.h\n"
+ "fmla z26.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z13.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x9, LSL #1]\n"
"addvl x27, x27, #1\n"
- "fmla z26.h, p3/M, z16.h, z22.h\n"
- "fmla z30.h, p3/M, z16.h, z0.h\n"
- "ld1h { z18.h }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z27.h, p3/M, z17.h, z5.h\n"
- "fmla z31.h, p3/M, z17.h, z2.h\n"
+ "fmla z31.h, p3/M, z16.h, z25.h\n"
+ "fmla z27.h, p3/M, z16.h, z19.h\n"
+ "fmla z26.h, p3/M, z16.h, z12.h\n"
+ "ld1h { z16.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "fmla z30.h, p3/M, z17.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x26]\n"
+ "fmla z31.h, p3/M, z17.h, z22.h\n"
+ "fmla z27.h, p3/M, z17.h, z12.h\n"
+ "ld1h { z29.h }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z17.h, z24.h\n"
+ "ld1h { z17.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z23.h }, p2/Z, [x26, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z22.h }, p2/Z, [x26, x12, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z24.h\n"
+ "fmla z26.h, p3/M, z0.h, z1.h\n"
+ "ld1h { z21.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "fmla z30.h, p3/M, z18.h, z14.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, x28, LSL #1]\n"
+ "fmla z31.h, p3/M, z18.h, z20.h\n"
+ "fmla z27.h, p3/M, z18.h, z25.h\n"
+ "fmla z26.h, p3/M, z18.h, z23.h\n"
+ "ld1h { z6.h }, p3/Z, [x11, #-6, MUL VL]\n"
+ "fmla z30.h, p3/M, z5.h, z20.h\n"
+ "ld1h { z0.h }, p2/Z, [x26, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z19.h\n"
+ "fmla z27.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z5.h, z22.h\n"
+ "ld1h { z20.h }, p3/Z, [x11, #-5, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z31.h, p3/M, z16.h, z12.h\n"
+ "fmla z27.h, p3/M, z16.h, z22.h\n"
+ "fmla z26.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x11, #-4, MUL VL]\n"
+ "fmla z30.h, p3/M, z17.h, z12.h\n"
"ld1h { z16.h }, p2/Z, [x25]\n"
- "fmla z26.h, p3/M, z17.h, z0.h\n"
- "fmla z30.h, p3/M, z17.h, z19.h\n"
- "ld1h { z17.h }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z27.h, p3/M, z21.h, z2.h\n"
- "fmla z31.h, p3/M, z21.h, z3.h\n"
- "ld1h { z4.h }, p2/Z, [x25, x17, LSL #1]\n"
- "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
- "fmla z26.h, p3/M, z21.h, z19.h\n"
- "fmla z30.h, p3/M, z21.h, z1.h\n"
- "ld1h { z13.h }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z27.h, p3/M, z23.h, z25.h\n"
- "fmla z31.h, p3/M, z23.h, z24.h\n"
- "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z23.h, z16.h\n"
- "fmla z30.h, p3/M, z23.h, z4.h\n"
- "ld1h { z5.h }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z24.h\n"
+ "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z26.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x11, #-3, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z24.h\n"
+ "ld1h { z9.h }, p2/Z, [x25, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z8.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z27.h, p3/M, z21.h, z19.h\n"
+ "fmla z26.h, p3/M, z21.h, z10.h\n"
+ "ld1h { z5.h }, p3/Z, [x11, #-2, MUL VL]\n"
+ "fmla z30.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x12, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z23.h\n"
+ "fmla z27.h, p3/M, z6.h, z16.h\n"
+ "fmla z26.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z4.h }, p3/Z, [x11, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z20.h, z23.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x10, LSL #1]\n"
"fmla z31.h, p3/M, z20.h, z22.h\n"
- "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
- "fmla z26.h, p3/M, z20.h, z4.h\n"
- "fmla z30.h, p3/M, z20.h, z25.h\n"
- "ld1h { z23.h }, p3/Z, [x10]\n"
- "fmla z27.h, p3/M, z18.h, z22.h\n"
- "fmla z31.h, p3/M, z18.h, z0.h\n"
- "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z27.h, p3/M, z20.h, z9.h\n"
+ "fmla z26.h, p3/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x11]\n"
+ "fmla z30.h, p3/M, z18.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x28, LSL #1]\n"
"addvl x25, x25, #1\n"
- "fmla z26.h, p3/M, z18.h, z25.h\n"
- "fmla z30.h, p3/M, z18.h, z24.h\n"
- "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "fmla z27.h, p3/M, z18.h, z25.h\n"
+ "fmla z26.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z17.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x24]\n"
"fmla z31.h, p3/M, z17.h, z19.h\n"
- "ld1h { z18.h }, p2/Z, [x23]\n"
- "fmla z26.h, p3/M, z17.h, z24.h\n"
- "fmla z30.h, p3/M, z17.h, z8.h\n"
- "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z27.h, p3/M, z13.h, z19.h\n"
- "fmla z31.h, p3/M, z13.h, z1.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
- "ld1h { z14.h }, p1/Z, [x9]\n"
- "fmla z26.h, p3/M, z13.h, z8.h\n"
- "fmla z30.h, p3/M, z13.h, z22.h\n"
- "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z27.h, p3/M, z5.h, z16.h\n"
- "fmla z31.h, p3/M, z5.h, z4.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z5.h, z18.h\n"
- "fmla z30.h, p3/M, z5.h, z17.h\n"
- "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z27.h, p3/M, z23.h, z4.h\n"
+ "fmla z27.h, p3/M, z17.h, z24.h\n"
+ "fmla z26.h, p3/M, z17.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z14.h }, p1/Z, [x27]\n"
+ "fmla z27.h, p3/M, z5.h, z8.h\n"
+ "fmla z26.h, p3/M, z5.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x11, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x12, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z9.h\n"
+ "fmla z27.h, p3/M, z4.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z17.h\n"
+ "ld1h { z0.h }, p3/Z, [x11, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z9.h\n"
+ "ld1h { z13.h }, p1/Z, [x20, x12, LSL #1]\n"
"fmla z31.h, p3/M, z23.h, z25.h\n"
- "ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
- "fmla z26.h, p3/M, z23.h, z17.h\n"
- "fmla z30.h, p3/M, z23.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z27.h, p3/M, z21.h, z25.h\n"
- "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z27.h, p3/M, z23.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z1.h }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z25.h\n"
"ld1h { z5.h }, p1/Z, [x14]\n"
- "fmla z26.h, p3/M, z21.h, z16.h\n"
- "fmla z30.h, p3/M, z21.h, z18.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
- "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z27.h, p3/M, z21.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x28, LSL #1]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "cmp x17, %x[n_channels]\n"
+ "addvl x24, x24, #1\n"
+ "fmla z26.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z2.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "fmla z30.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z6.h }, p1/Z, [x14, x8, LSL #1]\n"
"fmla z31.h, p3/M, z20.h, z8.h\n"
- "addvl x10, x10, #16\n"
- "whilelt p2.h, x21, %x[n_channels]\n"
- "fmla z26.h, p3/M, z20.h, z18.h\n"
- "fmla z30.h, p3/M, z20.h, z17.h\n"
- "cmp x12, %x[n_channels]\n"
- "addvl x23, x23, #1\n"
- "fmla z27.h, p3/M, z19.h, z8.h\n"
+ "fmla z27.h, p3/M, z20.h, z18.h\n"
+ "ld1h { z11.h }, p1/Z, [x14, x10, LSL #1]\n"
+ "fmla z26.h, p3/M, z20.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x11, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z19.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x20, x8, LSL #1]\n"
"fmla z31.h, p3/M, z19.h, z22.h\n"
- "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "ld1h { z10.h }, p1/Z, [x20, x28, LSL #1]\n"
+ "fmla z27.h, p3/M, z19.h, z17.h\n"
+ "ld1h { z12.h }, p1/Z, [x14, x9, LSL #1]\n"
+ "fmla z26.h, p3/M, z19.h, z16.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x12, LSL #1]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
"fmax z31.h, p3/M, z31.h, z15.h\n"
- "fmla z26.h, p3/M, z19.h, z17.h\n"
- "fmla z30.h, p3/M, z19.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
"fmax z26.h, p3/M, z26.h, z15.h\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
"fmin z31.h, p3/M, z31.h, z28.h\n"
- "ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
- "ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
"fmin z26.h, p3/M, z26.h, z28.h\n"
- "fmin z30.h, p3/M, z30.h, z28.h\n"
- "ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
- "ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
- "ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
- "st1h { z27.h }, p0, [x13]\n"
+ "st1h { z30.h }, p0, [x13]\n"
"st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
"addvl x13, x13, #1\n"
- "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
- "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
- "st1h { z26.h }, p0, [x22]\n"
- "addvl x10, x10, #-6\n"
- "st1h { z30.h }, p0, [x22, x16, LSL #1]\n"
- "addvl x22, x22, #1\n"
+ "st1h { z27.h }, p0, [x23]\n"
+ "st1h { z26.h }, p0, [x23, x16, LSL #1]\n"
+ "addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
"movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
- "ld1h { z22.h }, p2/Z, [x11, x28, LSL #1]\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ld1h { z22.h }, p2/Z, [x20, x10, LSL #1]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"movprfx z5, z29\n fmla z5.h, p3/M, z0.h, z7.h\n"
"fmla z29.h, p3/M, z0.h, z8.h\n"
- "ld1h { z20.h }, p3/Z, [x10]\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ld1h { z20.h }, p3/Z, [x11]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "mov p0.b, p2.b\n"
+ "add x7, x7, #0x1\n"
"fmla z30.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x20, x9, LSL #1]\n"
"fmla z31.h, p3/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x20, x6, #0x1\n"
"fmla z5.h, p3/M, z1.h, z8.h\n"
"fmla z29.h, p3/M, z1.h, z13.h\n"
- "ld1h { z19.h }, p3/Z, [x10, #1, MUL VL]\n"
- "add x8, x8, #0x1\n"
+ "ld1h { z19.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "cmp x7, x22\n"
+ "csel x6, x6, x20, LT\n"
+ "csel x7, x7, XZR, LT\n"
"fmla z30.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x28, LSL #1]\n"
"fmla z31.h, p3/M, z2.h, z11.h\n"
- "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
- "cmp x8, x20\n"
"fmla z5.h, p3/M, z2.h, z13.h\n"
"fmla z29.h, p3/M, z2.h, z22.h\n"
- "ld1h { z18.h }, p3/Z, [x10, #2, MUL VL]\n"
- "add x21, x12, #0x1\n"
+ "ld1h { z18.h }, p3/Z, [x11, #2, MUL VL]\n"
+ "cmp x6, x21\n"
"fmla z30.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x8, LSL #1]\n"
"fmla z31.h, p3/M, z3.h, z12.h\n"
- "ld1h { z1.h }, p2/Z, [x9, x17, LSL #1]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z5.h, p3/M, z3.h, z22.h\n"
"fmla z29.h, p3/M, z3.h, z6.h\n"
- "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
- "csel x12, x12, x21, LT\n"
+ "ld1h { z17.h }, p3/Z, [x11, #3, MUL VL]\n"
"fmla z30.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, x12, LSL #1]\n"
"fmla z31.h, p3/M, z4.h, z16.h\n"
- "ld1h { z0.h }, p2/Z, [x9, x15, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x27, x10, LSL #1]\n"
"fmla z5.h, p3/M, z4.h, z6.h\n"
"fmla z29.h, p3/M, z4.h, z10.h\n"
- "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
- "mov p0.b, p2.b\n"
+ "ld1h { z16.h }, p3/Z, [x11, #4, MUL VL]\n"
"fmla z30.h, p3/M, z20.h, z7.h\n"
"fmla z31.h, p3/M, z20.h, z8.h\n"
- "csel x8, x8, XZR, LT\n"
- "cmp x12, x20\n"
"fmla z5.h, p3/M, z20.h, z14.h\n"
"fmla z29.h, p3/M, z20.h, z1.h\n"
- "ld1h { z21.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1h { z21.h }, p3/Z, [x11, #5, MUL VL]\n"
"fmla z30.h, p3/M, z19.h, z8.h\n"
+ "ld1h { z26.h }, p2/Z, [x27, x28, LSL #1]\n"
"fmla z31.h, p3/M, z19.h, z13.h\n"
- "ld1h { z26.h }, p2/Z, [x9, x24, LSL #1]\n"
"fmla z5.h, p3/M, z19.h, z1.h\n"
"fmla z29.h, p3/M, z19.h, z0.h\n"
- "ld1h { z25.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "ld1h { z25.h }, p3/Z, [x11, #6, MUL VL]\n"
"fmla z30.h, p3/M, z18.h, z13.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x9, LSL #1]\n"
"fmla z31.h, p3/M, z18.h, z22.h\n"
- "ld1h { z24.h }, p2/Z, [x9, x26, LSL #1]\n"
"fmla z5.h, p3/M, z18.h, z0.h\n"
"fmla z29.h, p3/M, z18.h, z27.h\n"
- "ld1h { z23.h }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1h { z23.h }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
"fmla z30.h, p3/M, z17.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x26]\n"
"fmla z31.h, p3/M, z17.h, z6.h\n"
- "ld1h { z22.h }, p2/Z, [x27]\n"
"fmla z5.h, p3/M, z17.h, z27.h\n"
"fmla z29.h, p3/M, z17.h, z24.h\n"
- "ld1h { z20.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x11, #-8, MUL VL]\n"
"fmla z30.h, p3/M, z16.h, z6.h\n"
+ "ld1h { z18.h }, p2/Z, [x26, x8, LSL #1]\n"
"fmla z31.h, p3/M, z16.h, z10.h\n"
- "ld1h { z19.h }, p2/Z, [x27, x17, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x26, x12, LSL #1]\n"
"fmla z5.h, p3/M, z16.h, z24.h\n"
"fmla z29.h, p3/M, z16.h, z26.h\n"
- "ld1h { z16.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x11, #-7, MUL VL]\n"
"fmla z30.h, p3/M, z21.h, z14.h\n"
+ "ld1h { z19.h }, p2/Z, [x26, x28, LSL #1]\n"
"fmla z31.h, p3/M, z21.h, z1.h\n"
- "ld1h { z17.h }, p2/Z, [x27, x24, LSL #1]\n"
"fmla z5.h, p3/M, z21.h, z22.h\n"
- "fmla z29.h, p3/M, z21.h, z19.h\n"
- "ld1h { z21.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z29.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z21.h }, p3/Z, [x11, #-6, MUL VL]\n"
"fmla z30.h, p3/M, z25.h, z1.h\n"
+ "ld1h { z8.h }, p2/Z, [x26, x10, LSL #1]\n"
"fmla z31.h, p3/M, z25.h, z0.h\n"
- "ld1h { z7.h }, p2/Z, [x27, x28, LSL #1]\n"
- "fmla z5.h, p3/M, z25.h, z19.h\n"
- "fmla z29.h, p3/M, z25.h, z18.h\n"
- "ld1h { z10.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z5.h, p3/M, z25.h, z18.h\n"
+ "fmla z29.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z9.h }, p3/Z, [x11, #-5, MUL VL]\n"
"fmla z30.h, p3/M, z23.h, z0.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x9, LSL #1]\n"
"fmla z31.h, p3/M, z23.h, z27.h\n"
- "ld1h { z11.h }, p2/Z, [x27, x26, LSL #1]\n"
- "fmla z5.h, p3/M, z23.h, z18.h\n"
- "fmla z29.h, p3/M, z23.h, z7.h\n"
- "ld1h { z6.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z5.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z23.h, z8.h\n"
+ "ld1h { z6.h }, p3/Z, [x11, #-4, MUL VL]\n"
"fmla z30.h, p3/M, z20.h, z27.h\n"
- "fmla z31.h, p3/M, z20.h, z24.h\n"
"ld1h { z0.h }, p2/Z, [x25]\n"
- "fmla z5.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "fmla z5.h, p3/M, z20.h, z8.h\n"
"fmla z29.h, p3/M, z20.h, z11.h\n"
- "ld1h { z9.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x11, #-3, MUL VL]\n"
"fmla z30.h, p3/M, z16.h, z24.h\n"
+ "ld1h { z2.h }, p2/Z, [x25, x8, LSL #1]\n"
"fmla z31.h, p3/M, z16.h, z26.h\n"
- "ld1h { z3.h }, p2/Z, [x25, x17, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x25, x9, LSL #1]\n"
"fmla z5.h, p3/M, z16.h, z11.h\n"
- "fmla z29.h, p3/M, z16.h, z17.h\n"
- "ld1h { z16.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z29.h, p3/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p3/Z, [x11, #-2, MUL VL]\n"
"fmla z30.h, p3/M, z21.h, z22.h\n"
- "fmla z31.h, p3/M, z21.h, z19.h\n"
- "ld1h { z26.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z26.h }, p2/Z, [x25, x12, LSL #1]\n"
+ "fmla z31.h, p3/M, z21.h, z18.h\n"
"fmla z5.h, p3/M, z21.h, z0.h\n"
- "fmla z29.h, p3/M, z21.h, z3.h\n"
- "ld1h { z25.h }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z30.h, p3/M, z10.h, z19.h\n"
- "fmla z31.h, p3/M, z10.h, z18.h\n"
- "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
- "fmla z5.h, p3/M, z10.h, z3.h\n"
- "fmla z29.h, p3/M, z10.h, z26.h\n"
- "ld1h { z23.h }, p3/Z, [x10]\n"
- "fmla z30.h, p3/M, z6.h, z18.h\n"
- "fmla z31.h, p3/M, z6.h, z7.h\n"
- "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z29.h, p3/M, z21.h, z2.h\n"
+ "ld1h { z25.h }, p3/Z, [x11, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z9.h, z18.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z9.h, z17.h\n"
+ "fmla z5.h, p3/M, z9.h, z2.h\n"
+ "fmla z29.h, p3/M, z9.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x11]\n"
+ "fmla z30.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z8.h\n"
"fmla z5.h, p3/M, z6.h, z26.h\n"
"fmla z29.h, p3/M, z6.h, z24.h\n"
- "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z30.h, p3/M, z9.h, z7.h\n"
- "fmla z31.h, p3/M, z9.h, z11.h\n"
- "ld1h { z18.h }, p2/Z, [x23]\n"
- "fmla z5.h, p3/M, z9.h, z24.h\n"
- "fmla z29.h, p3/M, z9.h, z27.h\n"
- "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z21.h }, p3/Z, [x11, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z4.h, z8.h\n"
+ "ld1h { z18.h }, p2/Z, [x24]\n"
+ "fmla z31.h, p3/M, z4.h, z11.h\n"
+ "fmla z5.h, p3/M, z4.h, z24.h\n"
+ "fmla z29.h, p3/M, z4.h, z27.h\n"
+ "ld1h { z20.h }, p3/Z, [x11, #2, MUL VL]\n"
"fmla z30.h, p3/M, z16.h, z11.h\n"
- "fmla z31.h, p3/M, z16.h, z17.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x24, x8, LSL #1]\n"
+ "fmla z31.h, p3/M, z16.h, z19.h\n"
"fmla z5.h, p3/M, z16.h, z27.h\n"
"fmla z29.h, p3/M, z16.h, z22.h\n"
- "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z19.h }, p3/Z, [x11, #3, MUL VL]\n"
"fmla z30.h, p3/M, z25.h, z0.h\n"
- "fmla z31.h, p3/M, z25.h, z3.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x24, x12, LSL #1]\n"
+ "fmla z31.h, p3/M, z25.h, z2.h\n"
"fmla z5.h, p3/M, z25.h, z18.h\n"
+ "ld1h { z18.h }, p2/Z, [x24, x10, LSL #1]\n"
"fmla z29.h, p3/M, z25.h, z17.h\n"
- "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
- "fmla z30.h, p3/M, z23.h, z3.h\n"
+ "fmla z30.h, p3/M, z23.h, z2.h\n"
"fmla z31.h, p3/M, z23.h, z26.h\n"
"fmla z5.h, p3/M, z23.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x24, x9, LSL #1]\n"
"fmla z29.h, p3/M, z23.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
"fmla z30.h, p3/M, z21.h, z26.h\n"
"fmla z31.h, p3/M, z21.h, z24.h\n"
"fmla z5.h, p3/M, z21.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x24, x28, LSL #1]\n"
"fmla z29.h, p3/M, z21.h, z18.h\n"
- "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
"fmla z30.h, p3/M, z20.h, z24.h\n"
"fmla z31.h, p3/M, z20.h, z27.h\n"
"fmla z5.h, p3/M, z20.h, z18.h\n"
"fmla z29.h, p3/M, z20.h, z17.h\n"
"fmla z30.h, p3/M, z19.h, z27.h\n"
"fmla z31.h, p3/M, z19.h, z22.h\n"
- "fmax z30.h, p3/M, z30.h, z15.h\n"
- "fmax z31.h, p3/M, z31.h, z15.h\n"
"fmla z5.h, p3/M, z19.h, z17.h\n"
"fmla z29.h, p3/M, z19.h, z16.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
"fmax z5.h, p3/M, z5.h, z15.h\n"
- "fmax z29.h, p3/M, z29.h, z15.h\n"
"fmin z30.h, p3/M, z30.h, z28.h\n"
"fmin z31.h, p3/M, z31.h, z28.h\n"
- "st1h { z30.h }, p0, [x13]\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
"fmin z5.h, p3/M, z5.h, z28.h\n"
+ "st1h { z30.h }, p0, [x13]\n"
"fmin z29.h, p3/M, z29.h, z28.h\n"
"st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
- "st1h { z5.h }, p0, [x22]\n"
- "st1h { z29.h }, p0, [x22, x16, LSL #1]\n"
+ "st1h { z5.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x16, LSL #1]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 1ec0cb2cbf..0c084f5c83 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,449 +99,449 @@ void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x15, x14, [x20, #0x0]\n"
- "mov x13, #0x0\n"
- "ldp x12, x11, [x20, #0x10]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x16, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"whilelt p3.h, XZR, %x[n_channels]\n"
- "ldp x21, x20, [x16, #0x0]\n"
- "cnth x10\n"
+ "cnth x14\n"
"ptrue p2.b\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
- "ld1h { z5.h }, p3/Z, [x21, x13, LSL #1]\n"
- "cmp x10, %x[n_channels]\n"
- "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldp x27, x26, [x16, #0x10]\n"
- "sub x28, XZR, x10\n"
- "ldp x25, x24, [x16, #0x20]\n"
- "ldp x23, x22, [x16, #0x30]\n"
- "ldp x21, x20, [x16, #0x40]\n"
- "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rh { z28.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1h { z29.h }, p2/Z, [x9]\n"
- "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
- "ld1h { z1.h }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1h { z2.h }, p2/Z, [x9, #3, MUL VL]\n"
- "ld1h { z3.h }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x9, #5, MUL VL]\n"
- "ld1h { z7.h }, p3/Z, [x27, x13, LSL #1]\n"
- "addvl x9, x9, #6\n"
- "ld1h { z8.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ld1h { z13.h }, p3/Z, [x24, x13, LSL #1]\n"
- "ld1h { z11.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z12.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ld1h { z10.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "ldp x21, x20, [x17, #0x0]\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "cmp x14, %x[n_channels]\n"
+ "sub x9, XZR, x14\n"
+ "ld1rh { z17.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z30.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z5.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "ld1h { z29.h }, p2/Z, [x15]\n"
+ "ld1h { z0.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "addvl x15, x15, #6\n"
+ "ld1h { z8.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x16, LSL #1]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
- "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z6.h\n"
- "ldr x20, [x16, #0x50]\n"
- "ld1h { z5.h }, p3/Z, [x20, x13, LSL #1]\n"
- "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
- "movprfx z26, z29\n fmla z26.h, p2/M, z0.h, z8.h\n"
- "ldr x20, [x16, #0x58]\n"
- "ldr x21, [x16, #0x60]\n"
- "fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z27.h, p2/M, z1.h, z9.h\n"
- "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x20, [x16, #0x68]\n"
- "fmla z31.h, p2/M, z1.h, z8.h\n"
- "fmla z26.h, p2/M, z1.h, z13.h\n"
- "ld1h { z21.h }, p2/Z, [x9]\n"
- "ldr x23, [x16, #0x70]\n"
- "fmla z30.h, p2/M, z2.h, z9.h\n"
- "fmla z27.h, p2/M, z2.h, z11.h\n"
- "ld1h { z20.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z31.h, p2/M, z2.h, z13.h\n"
- "fmla z26.h, p2/M, z2.h, z5.h\n"
- "ldr x22, [x16, #0x78]\n"
- "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z27.h, p2/M, z3.h, z12.h\n"
- "ld1h { z11.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x21, [x16, #0x80]\n"
- "fmla z31.h, p2/M, z3.h, z5.h\n"
- "fmla z26.h, p2/M, z3.h, z22.h\n"
- "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
- "ldr x20, [x16, #0x88]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z27.h, p2/M, z4.h, z20.h\n"
- "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z29.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z4.h, z22.h\n"
- "fmla z26.h, p2/M, z4.h, z10.h\n"
- "ld1h { z19.h }, p2/Z, [x9, #4, MUL VL]\n"
- "ldr x23, [x16, #0x90]\n"
- "fmla z30.h, p2/M, z21.h, z7.h\n"
- "fmla z27.h, p2/M, z21.h, z8.h\n"
- "ldr x26, [x16, #0x98]\n"
- "ldr x22, [x16, #0xa0]\n"
- "fmla z31.h, p2/M, z21.h, z14.h\n"
- "fmla z26.h, p2/M, z21.h, z11.h\n"
- "ld1h { z25.h }, p2/Z, [x9, #5, MUL VL]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z30.h, p2/M, z18.h, z8.h\n"
- "fmla z27.h, p2/M, z18.h, z13.h\n"
- "ld1h { z24.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla z31.h, p2/M, z18.h, z11.h\n"
- "fmla z26.h, p2/M, z18.h, z0.h\n"
- "ld1h { z18.h }, p2/Z, [x9, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z30.h, p2/M, z17.h, z13.h\n"
- "fmla z27.h, p2/M, z17.h, z5.h\n"
- "ld1h { z3.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0xc0]\n"
- "fmla z31.h, p2/M, z17.h, z0.h\n"
- "fmla z26.h, p2/M, z17.h, z29.h\n"
- "ld1h { z17.h }, p2/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "fmla z30.h, p2/M, z16.h, z5.h\n"
- "fmla z27.h, p2/M, z16.h, z22.h\n"
- "ld1h { z6.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ldr x27, [x16, #0xc8]\n"
- "fmla z31.h, p2/M, z16.h, z29.h\n"
- "fmla z26.h, p2/M, z16.h, z3.h\n"
- "ld1h { z16.h }, p2/Z, [x9, #-8, MUL VL]\n"
- "ldr x23, [x16, #0xd0]\n"
- "fmla z30.h, p2/M, z19.h, z22.h\n"
- "fmla z27.h, p2/M, z19.h, z10.h\n"
- "ld1h { z23.h }, p3/Z, [x26, x13, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z19.h, z3.h\n"
- "fmla z26.h, p2/M, z19.h, z24.h\n"
- "ld1h { z21.h }, p2/Z, [x9, #-7, MUL VL]\n"
- "ldr x22, [x16, #0xd8]\n"
- "fmla z30.h, p2/M, z25.h, z14.h\n"
- "fmla z27.h, p2/M, z25.h, z11.h\n"
- "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x20, [x16, #0xe0]\n"
- "fmla z31.h, p2/M, z25.h, z6.h\n"
- "fmla z26.h, p2/M, z25.h, z23.h\n"
- "ld1h { z20.h }, p2/Z, [x9, #-6, MUL VL]\n"
- "ldr x26, [x16, #0xf8]\n"
- "fmla z30.h, p2/M, z18.h, z11.h\n"
- "fmla z27.h, p2/M, z18.h, z0.h\n"
- "ld1h { z7.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z31.h, p2/M, z18.h, z23.h\n"
- "fmla z26.h, p2/M, z18.h, z22.h\n"
- "ld1h { z18.h }, p2/Z, [x9, #-5, MUL VL]\n"
- "whilelt p1.h, x10, %x[n_channels]\n"
- "fmla z30.h, p2/M, z17.h, z0.h\n"
- "fmla z27.h, p2/M, z17.h, z29.h\n"
- "ld1h { z19.h }, p3/Z, [x24, x13, LSL #1]\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla z31.h, p2/M, z17.h, z22.h\n"
- "fmla z26.h, p2/M, z17.h, z7.h\n"
- "ld1h { z17.h }, p2/Z, [x9, #-4, MUL VL]\n"
- "inch x28\n"
- "fmla z30.h, p2/M, z16.h, z29.h\n"
- "fmla z27.h, p2/M, z16.h, z3.h\n"
- "ld1h { z0.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0x100]\n"
- "fmla z31.h, p2/M, z16.h, z7.h\n"
- "fmla z26.h, p2/M, z16.h, z19.h\n"
- "ld1h { z16.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "movprfx z15, z29\n fmla z15.h, p2/M, z0.h, z5.h\n"
+ "movprfx z28, z29\n fmla z28.h, p2/M, z0.h, z6.h\n"
+ "ldr x21, [x17, #0x50]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x22, [x17, #0x60]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "ld1h { z19.h }, p2/Z, [x15]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "inch x9\n"
+ "ld1h { z25.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x78]\n"
"mov p0.b, p3.b\n"
- "fmla z30.h, p2/M, z21.h, z3.h\n"
- "fmla z27.h, p2/M, z21.h, z24.h\n"
- "ld1h { z11.h }, p3/Z, [x27, x13, LSL #1]\n"
- "ld1h { z13.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z21.h, z19.h\n"
- "fmla z26.h, p2/M, z21.h, z1.h\n"
- "ld1h { z10.h }, p2/Z, [x9, #-2, MUL VL]\n"
- "ldr x20, [x16, #0x108]\n"
- "fmla z30.h, p2/M, z20.h, z6.h\n"
- "fmla z27.h, p2/M, z20.h, z23.h\n"
- "ld1h { z25.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ldr x23, [x16, #0x110]\n"
- "fmla z31.h, p2/M, z20.h, z0.h\n"
- "fmla z26.h, p2/M, z20.h, z11.h\n"
- "ld1h { z8.h }, p2/Z, [x9, #-1, MUL VL]\n"
- "ld1h { z29.h }, p2/Z, [x9, #4, MUL VL]\n"
- "fmla z30.h, p2/M, z18.h, z23.h\n"
- "fmla z27.h, p2/M, z18.h, z22.h\n"
- "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x22, [x16, #0x118]\n"
- "fmla z31.h, p2/M, z18.h, z11.h\n"
- "fmla z26.h, p2/M, z18.h, z25.h\n"
- "ld1h { z23.h }, p2/Z, [x9]\n"
- "fmla z30.h, p2/M, z17.h, z22.h\n"
- "fmla z27.h, p2/M, z17.h, z7.h\n"
- "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z17.h, z25.h\n"
- "fmla z26.h, p2/M, z17.h, z24.h\n"
- "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z30.h, p2/M, z16.h, z7.h\n"
- "fmla z27.h, p2/M, z16.h, z19.h\n"
- "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z16.h, z24.h\n"
- "fmla z26.h, p2/M, z16.h, z13.h\n"
- "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.h, p2/M, z10.h, z19.h\n"
- "fmla z27.h, p2/M, z10.h, z1.h\n"
- "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z10.h, z13.h\n"
- "fmla z26.h, p2/M, z10.h, z22.h\n"
- "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
- "fmla z30.h, p2/M, z8.h, z0.h\n"
- "fmla z27.h, p2/M, z8.h, z11.h\n"
- "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z31.h, p2/M, z8.h, z18.h\n"
- "fmla z26.h, p2/M, z8.h, z17.h\n"
- "ld1h { z18.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldp x21, x20, [x16, #0x0]\n"
- "fmla z30.h, p2/M, z23.h, z11.h\n"
- "fmla z27.h, p2/M, z23.h, z25.h\n"
- "ld1h { z0.h }, p2/Z, [x9, #5, MUL VL]\n"
- "fmla z31.h, p2/M, z23.h, z17.h\n"
- "fmla z26.h, p2/M, z23.h, z16.h\n"
- "ld1h { z17.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z1.h }, p2/Z, [x9, #6, MUL VL]\n"
- "fmla z30.h, p2/M, z21.h, z25.h\n"
- "fmla z27.h, p2/M, z21.h, z24.h\n"
- "ld1h { z5.h }, p1/Z, [x21, x10, LSL #1]\n"
- "fmla z31.h, p2/M, z21.h, z16.h\n"
- "fmla z26.h, p2/M, z21.h, z18.h\n"
- "ld1h { z16.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldp x27, x26, [x16, #0x10]\n"
- "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z15.h, p2/M, z1.h, z6.h\n"
+ "fmla z28.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x27, [x17, #0x80]\n"
+ "fmla z27.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z22.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ldr x23, [x17, #0x90]\n"
+ "ldr x26, [x17, #0x98]\n"
+ "fmla z15.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ "fmla z28.h, p2/M, z2.h, z11.h\n"
+ "fmla z27.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z25.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z15.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z2.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z28.h, p2/M, z3.h, z12.h\n"
+ "fmla z27.h, p2/M, z3.h, z25.h\n"
+ "fmla z31.h, p2/M, z3.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z15.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z28.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0xb8]\n"
+ "fmla z27.h, p2/M, z4.h, z23.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z3.h }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z15.h, p2/M, z19.h, z7.h\n"
+ "fmla z28.h, p2/M, z19.h, z8.h\n"
+ "fmla z27.h, p2/M, z19.h, z14.h\n"
+ "fmla z31.h, p2/M, z19.h, z2.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z15.h, p2/M, z22.h, z8.h\n"
+ "ld1h { z26.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x17, #0xc8]\n"
+ "fmla z28.h, p2/M, z22.h, z13.h\n"
+ "fmla z27.h, p2/M, z22.h, z2.h\n"
+ "fmla z31.h, p2/M, z22.h, z1.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z15.h, p2/M, z16.h, z13.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xc0]\n"
+ "fmla z28.h, p2/M, z16.h, z25.h\n"
+ "fmla z27.h, p2/M, z16.h, z1.h\n"
+ "fmla z31.h, p2/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z15.h, p2/M, z21.h, z25.h\n"
+ "ld1h { z25.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0xd0]\n"
+ "fmla z28.h, p2/M, z21.h, z23.h\n"
+ "ld1h { z29.h }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z27.h, p2/M, z21.h, z0.h\n"
+ "fmla z31.h, p2/M, z21.h, z9.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z15.h, p2/M, z3.h, z23.h\n"
+ "ld1h { z24.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x17, #0xd8]\n"
+ "fmla z28.h, p2/M, z3.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xe0]\n"
+ "fmla z27.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z26.h\n"
+ "ld1h { z22.h }, p2/Z, [x15, #-7, MUL VL]\n"
+ "fmla z15.h, p2/M, z20.h, z14.h\n"
+ "ld1h { z6.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x26, [x17, #0xf8]\n"
+ "fmla z28.h, p2/M, z20.h, z2.h\n"
+ "fmla z27.h, p2/M, z20.h, z25.h\n"
+ "fmla z31.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, #-6, MUL VL]\n"
+ "fmla z15.h, p2/M, z19.h, z2.h\n"
+ "ld1h { z21.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xe8]\n"
+ "fmla z28.h, p2/M, z19.h, z1.h\n"
+ "fmla z27.h, p2/M, z19.h, z24.h\n"
+ "fmla z31.h, p2/M, z19.h, z23.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #-5, MUL VL]\n"
+ "fmla z15.h, p2/M, z18.h, z1.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xf0]\n"
+ "fmla z28.h, p2/M, z18.h, z0.h\n"
+ "fmla z27.h, p2/M, z18.h, z23.h\n"
+ "fmla z31.h, p2/M, z18.h, z21.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, #-4, MUL VL]\n"
+ "fmla z15.h, p2/M, z16.h, z0.h\n"
+ "ld1h { z0.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x100]\n"
+ "fmla z28.h, p2/M, z16.h, z9.h\n"
+ "fmla z27.h, p2/M, z16.h, z21.h\n"
+ "fmla z31.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x15, #-3, MUL VL]\n"
+ "fmla z15.h, p2/M, z22.h, z9.h\n"
+ "ld1h { z12.h }, p3/Z, [x28, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z28.h, p2/M, z22.h, z26.h\n"
+ "ld1h { z4.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "fmla z27.h, p2/M, z22.h, z19.h\n"
+ "fmla z31.h, p2/M, z22.h, z6.h\n"
+ "ld1h { z14.h }, p2/Z, [x15, #-2, MUL VL]\n"
+ "fmla z15.h, p2/M, z10.h, z25.h\n"
+ "ld1h { z26.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x110]\n"
+ "fmla z28.h, p2/M, z10.h, z24.h\n"
+ "fmla z27.h, p2/M, z10.h, z0.h\n"
+ "fmla z31.h, p2/M, z10.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x15, #-1, MUL VL]\n"
+ "fmla z15.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z25.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x22, [x17, #0x118]\n"
+ "fmla z28.h, p2/M, z20.h, z23.h\n"
+ "fmla z27.h, p2/M, z20.h, z12.h\n"
+ "fmla z31.h, p2/M, z20.h, z26.h\n"
+ "ld1h { z24.h }, p2/Z, [x15]\n"
+ "fmla z15.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z23.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z18.h, z21.h\n"
+ "fmla z27.h, p2/M, z18.h, z26.h\n"
+ "fmla z31.h, p2/M, z18.h, z25.h\n"
+ "ld1h { z22.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "fmla z15.h, p2/M, z16.h, z21.h\n"
+ "ld1h { z21.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z16.h, z19.h\n"
+ "fmla z27.h, p2/M, z16.h, z25.h\n"
+ "fmla z31.h, p2/M, z16.h, z4.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z15.h, p2/M, z14.h, z19.h\n"
+ "ld1h { z19.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z14.h, z6.h\n"
+ "fmla z27.h, p2/M, z14.h, z4.h\n"
+ "fmla z31.h, p2/M, z14.h, z23.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z15.h, p2/M, z10.h, z0.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "fmla z28.h, p2/M, z10.h, z12.h\n"
+ "fmla z27.h, p2/M, z10.h, z21.h\n"
+ "ld1h { z13.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x0]\n"
+ "fmla z31.h, p2/M, z10.h, z19.h\n"
+ "ld1h { z0.h }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z15.h, p2/M, z24.h, z12.h\n"
+ "fmla z28.h, p2/M, z24.h, z26.h\n"
+ "fmla z27.h, p2/M, z24.h, z19.h\n"
+ "ld1h { z12.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "fmla z31.h, p2/M, z24.h, z16.h\n"
+ "ld1h { z1.h }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z15.h, p2/M, z22.h, z26.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "fmla z28.h, p2/M, z22.h, z25.h\n"
+ "fmla z27.h, p2/M, z22.h, z16.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "inch x16\n"
+ "fmla z31.h, p2/M, z22.h, z13.h\n"
+ "ld1h { z2.h }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z15.h, p2/M, z20.h, z25.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "ld1h { z7.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "fmla z28.h, p2/M, z20.h, z4.h\n"
"fmla z27.h, p2/M, z20.h, z13.h\n"
- "ld1h { z6.h }, p1/Z, [x20, x10, LSL #1]\n"
- "ldp x25, x24, [x16, #0x20]\n"
- "fmla z31.h, p2/M, z20.h, z18.h\n"
- "fmla z26.h, p2/M, z20.h, z17.h\n"
- "ldp x23, x22, [x16, #0x30]\n"
- "ldp x21, x20, [x16, #0x40]\n"
- "fmla z30.h, p2/M, z19.h, z13.h\n"
- "fmla z27.h, p2/M, z19.h, z22.h\n"
- "inch x13\n"
- "ld1h { z7.h }, p1/Z, [x27, x10, LSL #1]\n"
- "fmla z31.h, p2/M, z19.h, z17.h\n"
- "fmla z26.h, p2/M, z19.h, z16.h\n"
- "ld1h { z8.h }, p1/Z, [x26, x10, LSL #1]\n"
- "ld1h { z9.h }, p1/Z, [x25, x10, LSL #1]\n"
- "ld1h { z13.h }, p1/Z, [x24, x10, LSL #1]\n"
- "ld1h { z11.h }, p1/Z, [x23, x10, LSL #1]\n"
- "fmax z30.h, p2/M, z30.h, z15.h\n"
- "fmax z27.h, p2/M, z27.h, z15.h\n"
- "ld1h { z12.h }, p1/Z, [x22, x10, LSL #1]\n"
- "ld1h { z10.h }, p1/Z, [x21, x10, LSL #1]\n"
- "fmax z31.h, p2/M, z31.h, z15.h\n"
- "fmax z26.h, p2/M, z26.h, z15.h\n"
- "ld1h { z14.h }, p1/Z, [x20, x10, LSL #1]\n"
- "inch x10\n"
- "ld1h { z2.h }, p2/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "whilelt p3.h, x13, %x[n_channels]\n"
- "cmp x10, %x[n_channels]\n"
- "ld1h { z3.h }, p2/Z, [x9, #-8, MUL VL]\n"
- "ld1h { z4.h }, p2/Z, [x9, #-7, MUL VL]\n"
- "fmin z30.h, p2/M, z30.h, z28.h\n"
- "fmin z27.h, p2/M, z27.h, z28.h\n"
- "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
- "fmin z31.h, p2/M, z31.h, z28.h\n"
- "fmin z26.h, p2/M, z26.h, z28.h\n"
- "st1h { z27.h }, p0, [x14, x28, LSL #1]\n"
- "st1h { z31.h }, p0, [x12, x28, LSL #1]\n"
- "addvl x9, x9, #-6\n"
- "st1h { z26.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "whilelt p3.h, x16, %x[n_channels]\n"
+ "fmla z31.h, p2/M, z20.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z15.h, p2/M, z18.h, z4.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z28.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z10.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "fmla z27.h, p2/M, z18.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "fmla z31.h, p2/M, z18.h, z16.h\n"
+ "ld1h { z9.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "inch x14\n"
+ "ld1h { z4.h }, p2/Z, [x15, #-7, MUL VL]\n"
+ "addvl x15, x15, #-6\n"
+ "fmax z15.h, p2/M, z15.h, z17.h\n"
+ "fmax z28.h, p2/M, z28.h, z17.h\n"
+ "fmax z27.h, p2/M, z27.h, z17.h\n"
+ "cmp x14, %x[n_channels]\n"
+ "fmax z31.h, p2/M, z31.h, z17.h\n"
+ "fmin z15.h, p2/M, z15.h, z30.h\n"
+ "fmin z28.h, p2/M, z28.h, z30.h\n"
+ "fmin z27.h, p2/M, z27.h, z30.h\n"
+ "fmin z31.h, p2/M, z31.h, z30.h\n"
+ "st1h { z15.h }, p0, [x13, x9, LSL #1]\n"
+ "st1h { z28.h }, p0, [x12, x9, LSL #1]\n"
+ "st1h { z27.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x10, x9, LSL #1]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"
- "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z6.h\n"
- "ldr x20, [x16, #0x50]\n"
- "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
- "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z7.h\n"
- "fmla z29.h, p2/M, z0.h, z8.h\n"
- "ldr x20, [x16, #0x58]\n"
- "ldr x21, [x16, #0x60]\n"
- "fmla z30.h, p2/M, z1.h, z6.h\n"
- "fmla z31.h, p2/M, z1.h, z9.h\n"
- "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x20, [x16, #0x68]\n"
- "fmla z5.h, p2/M, z1.h, z8.h\n"
- "fmla z29.h, p2/M, z1.h, z13.h\n"
- "ld1h { z20.h }, p2/Z, [x9]\n"
- "ldr x23, [x16, #0x70]\n"
- "fmla z30.h, p2/M, z2.h, z9.h\n"
- "fmla z31.h, p2/M, z2.h, z11.h\n"
- "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ld1h { z19.h }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z5.h, p2/M, z2.h, z13.h\n"
- "fmla z29.h, p2/M, z2.h, z22.h\n"
- "ldr x21, [x16, #0x78]\n"
- "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.h, p2/M, z3.h, z11.h\n"
- "fmla z31.h, p2/M, z3.h, z12.h\n"
- "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x22, [x16, #0x80]\n"
- "fmla z5.h, p2/M, z3.h, z22.h\n"
- "fmla z29.h, p2/M, z3.h, z6.h\n"
- "ld1h { z17.h }, p2/Z, [x9, #3, MUL VL]\n"
- "ldr x20, [x16, #0x88]\n"
- "fmla z30.h, p2/M, z4.h, z12.h\n"
- "fmla z31.h, p2/M, z4.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ld1h { z27.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z4.h, z6.h\n"
- "fmla z29.h, p2/M, z4.h, z10.h\n"
- "ld1h { z16.h }, p2/Z, [x9, #4, MUL VL]\n"
- "ldr x21, [x16, #0x90]\n"
- "fmla z30.h, p2/M, z20.h, z7.h\n"
- "fmla z31.h, p2/M, z20.h, z8.h\n"
- "ldr x27, [x16, #0x98]\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla z5.h, p2/M, z20.h, z14.h\n"
- "fmla z29.h, p2/M, z20.h, z1.h\n"
- "ld1h { z21.h }, p2/Z, [x9, #5, MUL VL]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z30.h, p2/M, z19.h, z8.h\n"
- "fmla z31.h, p2/M, z19.h, z13.h\n"
- "ld1h { z26.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla z5.h, p2/M, z19.h, z1.h\n"
- "fmla z29.h, p2/M, z19.h, z0.h\n"
- "ld1h { z25.h }, p2/Z, [x9, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z30.h, p2/M, z18.h, z13.h\n"
- "fmla z31.h, p2/M, z18.h, z22.h\n"
- "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ldr x23, [x16, #0xc0]\n"
- "fmla z5.h, p2/M, z18.h, z0.h\n"
- "fmla z29.h, p2/M, z18.h, z27.h\n"
- "ld1h { z23.h }, p2/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "fmla z30.h, p2/M, z17.h, z22.h\n"
- "fmla z31.h, p2/M, z17.h, z6.h\n"
- "ld1h { z22.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x22, [x16, #0xc8]\n"
- "fmla z5.h, p2/M, z17.h, z27.h\n"
- "fmla z29.h, p2/M, z17.h, z24.h\n"
- "ld1h { z20.h }, p2/Z, [x9, #-8, MUL VL]\n"
- "ldr x21, [x16, #0xd0]\n"
- "fmla z30.h, p2/M, z16.h, z6.h\n"
- "fmla z31.h, p2/M, z16.h, z10.h\n"
- "ld1h { z19.h }, p3/Z, [x27, x13, LSL #1]\n"
- "ld1h { z18.h }, p3/Z, [x26, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z16.h, z24.h\n"
- "fmla z29.h, p2/M, z16.h, z26.h\n"
- "ld1h { z16.h }, p2/Z, [x9, #-7, MUL VL]\n"
- "ldr x27, [x16, #0xd8]\n"
- "fmla z30.h, p2/M, z21.h, z14.h\n"
+ "movprfx z16, z29\n fmla z16.h, p2/M, z0.h, z5.h\n"
+ "movprfx z15, z29\n fmla z15.h, p2/M, z0.h, z6.h\n"
+ "ldr x22, [x17, #0x50]\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
+ "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x17, #0x60]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "ld1h { z25.h }, p2/Z, [x15]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "inch x9\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z24.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z16.h, p2/M, z1.h, z6.h\n"
+ "fmla z15.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x27, [x17, #0x80]\n"
+ "fmla z31.h, p2/M, z1.h, z8.h\n"
+ "fmla z5.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "ldr x22, [x17, #0x88]\n"
+ "ldr x21, [x17, #0x90]\n"
+ "ldr x26, [x17, #0x98]\n"
+ "fmla z16.h, p2/M, z2.h, z9.h\n"
+ "fmla z15.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xa0]\n"
+ "fmla z31.h, p2/M, z2.h, z13.h\n"
+ "fmla z5.h, p2/M, z2.h, z24.h\n"
+ "ld1h { z22.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z16.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z15.h, p2/M, z3.h, z12.h\n"
+ "fmla z31.h, p2/M, z3.h, z24.h\n"
+ "fmla z5.h, p2/M, z3.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z16.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z0.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z15.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z29.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z31.h, p2/M, z4.h, z23.h\n"
+ "fmla z5.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z16.h, p2/M, z25.h, z7.h\n"
+ "fmla z15.h, p2/M, z25.h, z8.h\n"
+ "fmla z31.h, p2/M, z25.h, z14.h\n"
+ "fmla z5.h, p2/M, z25.h, z1.h\n"
+ "ld1h { z18.h }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z16.h, p2/M, z20.h, z8.h\n"
+ "ld1h { z28.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x28, [x17, #0xc8]\n"
+ "fmla z15.h, p2/M, z20.h, z13.h\n"
+ "fmla z31.h, p2/M, z20.h, z1.h\n"
+ "fmla z5.h, p2/M, z20.h, z0.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z16.h, p2/M, z22.h, z13.h\n"
+ "ld1h { z27.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z15.h, p2/M, z22.h, z24.h\n"
+ "fmla z31.h, p2/M, z22.h, z0.h\n"
+ "fmla z5.h, p2/M, z22.h, z29.h\n"
+ "ld1h { z26.h }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z16.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z25.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z15.h, p2/M, z21.h, z23.h\n"
+ "fmla z31.h, p2/M, z21.h, z29.h\n"
+ "fmla z5.h, p2/M, z21.h, z27.h\n"
+ "ld1h { z24.h }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z16.h, p2/M, z19.h, z23.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x17, #0xd8]\n"
+ "fmla z15.h, p2/M, z19.h, z10.h\n"
+ "ld1h { z22.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x17, #0xe0]\n"
+ "fmla z31.h, p2/M, z19.h, z27.h\n"
+ "fmla z5.h, p2/M, z19.h, z28.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #-7, MUL VL]\n"
+ "fmla z16.h, p2/M, z18.h, z14.h\n"
+ "ld1h { z2.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "ldr x26, [x17, #0xf8]\n"
+ "fmla z15.h, p2/M, z18.h, z1.h\n"
+ "fmla z31.h, p2/M, z18.h, z25.h\n"
+ "fmla z5.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x15, #-6, MUL VL]\n"
+ "fmla z16.h, p2/M, z20.h, z1.h\n"
+ "ld1h { z18.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "ldr x25, [x17, #0xe8]\n"
+ "fmla z15.h, p2/M, z20.h, z0.h\n"
+ "fmla z31.h, p2/M, z20.h, z23.h\n"
+ "fmla z5.h, p2/M, z20.h, z22.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #-5, MUL VL]\n"
+ "fmla z16.h, p2/M, z26.h, z0.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "ldr x24, [x17, #0xf0]\n"
+ "fmla z15.h, p2/M, z26.h, z29.h\n"
+ "fmla z31.h, p2/M, z26.h, z22.h\n"
+ "fmla z5.h, p2/M, z26.h, z18.h\n"
+ "ld1h { z4.h }, p2/Z, [x15, #-4, MUL VL]\n"
+ "fmla z16.h, p2/M, z24.h, z29.h\n"
+ "ld1h { z1.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z15.h, p2/M, z24.h, z27.h\n"
+ "fmla z31.h, p2/M, z24.h, z18.h\n"
+ "fmla z5.h, p2/M, z24.h, z9.h\n"
+ "ld1h { z3.h }, p2/Z, [x15, #-3, MUL VL]\n"
+ "fmla z16.h, p2/M, z19.h, z27.h\n"
+ "ld1h { z0.h }, p3/Z, [x28, x16, LSL #1]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z15.h, p2/M, z19.h, z28.h\n"
+ "ld1h { z29.h }, p3/Z, [x20, x16, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z9.h\n"
+ "fmla z5.h, p2/M, z19.h, z2.h\n"
+ "ld1h { z19.h }, p2/Z, [x15, #-2, MUL VL]\n"
+ "fmla z16.h, p2/M, z21.h, z25.h\n"
+ "ld1h { z28.h }, p3/Z, [x21, x16, LSL #1]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z15.h, p2/M, z21.h, z23.h\n"
"fmla z31.h, p2/M, z21.h, z1.h\n"
- "ld1h { z17.h }, p3/Z, [x20, x13, LSL #1]\n"
- "ldr x20, [x16, #0xe0]\n"
- "fmla z5.h, p2/M, z21.h, z22.h\n"
- "fmla z29.h, p2/M, z21.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x9, #-6, MUL VL]\n"
- "ldr x26, [x16, #0xf8]\n"
- "fmla z30.h, p2/M, z25.h, z1.h\n"
- "fmla z31.h, p2/M, z25.h, z0.h\n"
- "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
- "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.h, p2/M, z21.h, z0.h\n"
+ "ld1h { z27.h }, p2/Z, [x15, #-1, MUL VL]\n"
+ "fmla z16.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z26.h }, p3/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z15.h, p2/M, z20.h, z22.h\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "fmla z5.h, p2/M, z20.h, z28.h\n"
+ "ld1h { z25.h }, p2/Z, [x15]\n"
+ "fmla z16.h, p2/M, z4.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p2/M, z4.h, z18.h\n"
+ "fmla z31.h, p2/M, z4.h, z28.h\n"
+ "fmla z5.h, p2/M, z4.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x15, #1, MUL VL]\n"
+ "fmla z16.h, p2/M, z3.h, z18.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x16, LSL #1]\n"
+ "fmla z15.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z26.h\n"
+ "fmla z5.h, p2/M, z3.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z16.h, p2/M, z19.h, z9.h\n"
+ "ld1h { z21.h }, p3/Z, [x26, x16, LSL #1]\n"
+ "fmla z15.h, p2/M, z19.h, z2.h\n"
+ "fmla z31.h, p2/M, z19.h, z29.h\n"
+ "fmla z5.h, p2/M, z19.h, z24.h\n"
+ "ld1h { z20.h }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z16.h, p2/M, z27.h, z1.h\n"
+ "ld1h { z19.h }, p3/Z, [x23, x16, LSL #1]\n"
+ "fmla z15.h, p2/M, z27.h, z0.h\n"
+ "fmla z31.h, p2/M, z27.h, z18.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x16, LSL #1]\n"
+ "fmla z5.h, p2/M, z27.h, z21.h\n"
+ "fmla z16.h, p2/M, z25.h, z0.h\n"
+ "fmla z15.h, p2/M, z25.h, z28.h\n"
+ "fmla z31.h, p2/M, z25.h, z21.h\n"
+ "ld1h { z21.h }, p3/Z, [x21, x16, LSL #1]\n"
"fmla z5.h, p2/M, z25.h, z19.h\n"
- "fmla z29.h, p2/M, z25.h, z18.h\n"
- "ld1h { z4.h }, p2/Z, [x9, #-5, MUL VL]\n"
- "inch x28\n"
- "fmla z30.h, p2/M, z23.h, z0.h\n"
- "fmla z31.h, p2/M, z23.h, z27.h\n"
- "ld1h { z8.h }, p3/Z, [x24, x13, LSL #1]\n"
- "ldr x24, [x16, #0xf0]\n"
+ "fmla z16.h, p2/M, z23.h, z28.h\n"
+ "fmla z15.h, p2/M, z23.h, z26.h\n"
+ "fmla z31.h, p2/M, z23.h, z19.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x16, LSL #1]\n"
"fmla z5.h, p2/M, z23.h, z18.h\n"
- "fmla z29.h, p2/M, z23.h, z9.h\n"
- "ld1h { z6.h }, p2/Z, [x9, #-4, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z30.h, p2/M, z20.h, z27.h\n"
- "fmla z31.h, p2/M, z20.h, z24.h\n"
- "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
- "ldr x23, [x16, #0x100]\n"
- "fmla z5.h, p2/M, z20.h, z9.h\n"
- "fmla z29.h, p2/M, z20.h, z8.h\n"
- "ld1h { z11.h }, p2/Z, [x9, #-3, MUL VL]\n"
- "fmla z30.h, p2/M, z16.h, z24.h\n"
- "fmla z31.h, p2/M, z16.h, z26.h\n"
- "ld1h { z0.h }, p3/Z, [x22, x13, LSL #1]\n"
- "ld1h { z27.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z16.h, z8.h\n"
- "fmla z29.h, p2/M, z16.h, z17.h\n"
- "ld1h { z16.h }, p2/Z, [x9, #-2, MUL VL]\n"
- "ldr x22, [x16, #0x108]\n"
- "fmla z30.h, p2/M, z21.h, z22.h\n"
- "fmla z31.h, p2/M, z21.h, z19.h\n"
- "ld1h { z26.h }, p3/Z, [x21, x13, LSL #1]\n"
- "ldr x21, [x16, #0x110]\n"
- "fmla z5.h, p2/M, z21.h, z10.h\n"
- "fmla z29.h, p2/M, z21.h, z0.h\n"
- "ld1h { z25.h }, p2/Z, [x9, #-1, MUL VL]\n"
- "fmla z30.h, p2/M, z4.h, z19.h\n"
- "fmla z31.h, p2/M, z4.h, z18.h\n"
- "ld1h { z24.h }, p3/Z, [x27, x13, LSL #1]\n"
- "ldr x20, [x16, #0x118]\n"
- "fmla z5.h, p2/M, z4.h, z0.h\n"
- "fmla z29.h, p2/M, z4.h, z26.h\n"
- "ld1h { z23.h }, p2/Z, [x9]\n"
- "fmla z30.h, p2/M, z6.h, z18.h\n"
- "fmla z31.h, p2/M, z6.h, z9.h\n"
- "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z6.h, z26.h\n"
- "fmla z29.h, p2/M, z6.h, z24.h\n"
- "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z30.h, p2/M, z11.h, z9.h\n"
- "fmla z31.h, p2/M, z11.h, z8.h\n"
- "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z11.h, z24.h\n"
- "fmla z29.h, p2/M, z11.h, z27.h\n"
- "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.h, p2/M, z16.h, z8.h\n"
- "fmla z31.h, p2/M, z16.h, z17.h\n"
- "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z16.h, z27.h\n"
- "fmla z29.h, p2/M, z16.h, z22.h\n"
- "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
- "fmla z30.h, p2/M, z25.h, z10.h\n"
- "fmla z31.h, p2/M, z25.h, z0.h\n"
- "ld1h { z16.h }, p3/Z, [x23, x13, LSL #1]\n"
- "fmla z5.h, p2/M, z25.h, z18.h\n"
- "fmla z29.h, p2/M, z25.h, z17.h\n"
- "ld1h { z18.h }, p3/Z, [x22, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z23.h, z0.h\n"
- "fmla z31.h, p2/M, z23.h, z26.h\n"
- "fmla z5.h, p2/M, z23.h, z17.h\n"
- "fmla z29.h, p2/M, z23.h, z16.h\n"
- "ld1h { z17.h }, p3/Z, [x21, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z21.h, z26.h\n"
- "fmla z31.h, p2/M, z21.h, z24.h\n"
- "fmla z5.h, p2/M, z21.h, z16.h\n"
- "fmla z29.h, p2/M, z21.h, z18.h\n"
- "ld1h { z16.h }, p3/Z, [x20, x13, LSL #1]\n"
- "fmla z30.h, p2/M, z20.h, z24.h\n"
- "fmla z31.h, p2/M, z20.h, z27.h\n"
- "fmla z5.h, p2/M, z20.h, z18.h\n"
- "fmla z29.h, p2/M, z20.h, z17.h\n"
- "fmla z30.h, p2/M, z19.h, z27.h\n"
- "fmla z31.h, p2/M, z19.h, z22.h\n"
- "fmax z30.h, p2/M, z30.h, z15.h\n"
- "fmax z31.h, p2/M, z31.h, z15.h\n"
- "fmla z5.h, p2/M, z19.h, z17.h\n"
- "fmla z29.h, p2/M, z19.h, z16.h\n"
- "fmax z5.h, p2/M, z5.h, z15.h\n"
- "fmax z29.h, p2/M, z29.h, z15.h\n"
- "fmin z30.h, p2/M, z30.h, z28.h\n"
- "fmin z31.h, p2/M, z31.h, z28.h\n"
- "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"
- "fmin z5.h, p2/M, z5.h, z28.h\n"
- "fmin z29.h, p2/M, z29.h, z28.h\n"
- "st1h { z31.h }, p0, [x14, x28, LSL #1]\n"
- "st1h { z5.h }, p0, [x12, x28, LSL #1]\n"
- "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "fmla z16.h, p2/M, z22.h, z26.h\n"
+ "fmla z15.h, p2/M, z22.h, z29.h\n"
+ "fmla z31.h, p2/M, z22.h, z18.h\n"
+ "fmla z5.h, p2/M, z22.h, z21.h\n"
+ "fmla z16.h, p2/M, z20.h, z29.h\n"
+ "fmla z15.h, p2/M, z20.h, z24.h\n"
+ "fmla z31.h, p2/M, z20.h, z21.h\n"
+ "fmla z5.h, p2/M, z20.h, z12.h\n"
+ "fmax z16.h, p2/M, z16.h, z17.h\n"
+ "fmax z15.h, p2/M, z15.h, z17.h\n"
+ "fmax z31.h, p2/M, z31.h, z17.h\n"
+ "fmin z16.h, p2/M, z16.h, z30.h\n"
+ "fmin z15.h, p2/M, z15.h, z30.h\n"
+ "fmax z5.h, p2/M, z5.h, z17.h\n"
+ "fmin z31.h, p2/M, z31.h, z30.h\n"
+ "st1h { z16.h }, p0, [x13, x9, LSL #1]\n"
+ "fmin z5.h, p2/M, z5.h, z30.h\n"
+ "st1h { z15.h }, p0, [x12, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z5.h }, p0, [x10, x9, LSL #1]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 1bdef85274..6044784ff9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,84 +88,84 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x10, #0x0\n"
- "mov x14, #0x0\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
"1:" // Tile loop
- "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
"mov x25, #0x2\n"
- "mov x24, #0x2\n"
- "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
- "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
- "cntw x11\n"
- "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
- "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "cntw x15\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
- "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "ld1w { z27.s }, p3/Z, [x10]\n"
- "add x27, x13, x13\n"
- "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
- "add x26, x9, x23, LSL #2\n"
- "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
- "add x25, x26, x23, LSL #2\n"
- "add x24, x27, x13\n"
- "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "cmp x11, %x[n_channels]\n"
- "add x23, x25, x23, LSL #2\n"
- "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "add x22, x28, x22, LSL #2\n"
- "mov x21, #0x0\n"
- "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
- "sub x20, XZR, x11\n"
- "ld1w { z10.s }, p2/Z, [x9]\n"
- "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
- "addvl x10, x10, #-6\n"
- "ld1w { z12.s }, p2/Z, [x26, x27, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "mov x12, #0x0\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x22, x17, x24\n" // offset = tile_i * ld_input_row
+ "mul x21, x17, x23\n" // offset = tile_i * ld_output_row
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1rw { z27.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x28, x14, x14\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x20, XZR, x15\n"
+ "madd x22, x16, x14, x22\n" // offset += tile_j * ld_input_col
+ "ld1w { z25.s }, p3/Z, [x11]\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "add x27, x28, x14\n"
+ "madd x21, x16, x13, x21\n" // offset += tile_j * ld_output_col
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x10, x10, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x26, x10, x24, LSL #2\n"
+ "ld1w { z10.s }, p2/Z, [x10]\n"
+ "ld1w { z11.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "add x25, x26, x24, LSL #2\n"
+ "add x9, x9, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x24, x25, x24, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "add x23, x9, x23, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1w { z13.s }, p2/Z, [x25, x14, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
- "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
- "whilelt p1.s, x11, %x[n_channels]\n"
- "incw x21\n"
- "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z18.s }, p2/Z, [x23]\n"
- "incw x11\n"
+ "movprfx z24, z25\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z25\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
+ "incw x12\n"
+ "movprfx z22, z25\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z25\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x24]\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "ld1w { z25.s }, p3/Z, [x11]\n"
+ "incw x20\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x28, LSL #2]\n"
"fmla z23.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
"fmla z22.s, p3/M, z2.s, z12.s\n"
"fmla z21.s, p3/M, z1.s, z12.s\n"
- "mov p0.b, p2.b\n"
- "ld1w { z27.s }, p3/Z, [x10]\n"
"fmla z24.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
- "incw x20\n"
+ "ld1w { z16.s }, p2/Z, [x10, x14, LSL #2]\n"
"fmla z22.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x28, LSL #2]\n"
+ "addvl x10, x10, #1\n"
"fmla z21.s, p3/M, z3.s, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
- "addvl x9, x9, #1\n"
"fmla z24.s, p3/M, z7.s, z13.s\n"
"fmla z23.s, p3/M, z6.s, z13.s\n"
"fmla z22.s, p3/M, z4.s, z13.s\n"
@@ -173,102 +173,102 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"ld1w { z17.s }, p2/Z, [x26]\n"
"fmla z24.s, p3/M, z1.s, z16.s\n"
"fmla z23.s, p3/M, z0.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x27, LSL #2]\n"
"addvl x26, x26, #1\n"
"fmla z22.s, p3/M, z5.s, z20.s\n"
"fmla z21.s, p3/M, z4.s, z20.s\n"
- "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
"fmla z24.s, p3/M, z2.s, z18.s\n"
"fmla z23.s, p3/M, z1.s, z18.s\n"
"ld1w { z19.s }, p2/Z, [x25]\n"
- "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
"fmla z22.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
"fmla z21.s, p3/M, z2.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
"fmla z24.s, p3/M, z8.s, z20.s\n"
"fmla z23.s, p3/M, z7.s, z20.s\n"
- "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x25, x27, LSL #2]\n"
"addvl x25, x25, #1\n"
"fmla z22.s, p3/M, z3.s, z19.s\n"
"fmla z21.s, p3/M, z5.s, z18.s\n"
- "ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
"fmla z24.s, p3/M, z3.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z13.s }, p1/Z, [x25, x14, LSL #2]\n"
"fmla z23.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "whilelt p2.s, x12, %x[n_channels]\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "addvl x24, x24, #1\n"
"fmla z22.s, p3/M, z7.s, z17.s\n"
"fmla z21.s, p3/M, z6.s, z17.s\n"
- "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x10, x27, LSL #2]\n"
"fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "ld1w { z9.s }, p1/Z, [x26, x14, LSL #2]\n"
"fmla z23.s, p3/M, z8.s, z18.s\n"
- "fmax z24.s, p3/M, z24.s, z26.s\n"
- "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "ld1w { z10.s }, p1/Z, [x10]\n"
"fmla z22.s, p3/M, z8.s, z16.s\n"
"fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z26.s\n"
- "fmax z21.s, p3/M, z21.s, z26.s\n"
- "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
- "cmp x11, %x[n_channels]\n"
- "fmin z24.s, p3/M, z24.s, z25.s\n"
- "ld1w { z10.s }, p1/Z, [x9]\n"
- "ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z25.s\n"
- "fmin z22.s, p3/M, z22.s, z25.s\n"
- "ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
- "st1w { z24.s }, p0, [x28]\n"
- "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "ld1w { z12.s }, p1/Z, [x26, x28, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z27.s\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "fmax z23.s, p3/M, z23.s, z27.s\n"
+ "fmin z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z27.s\n"
+ "fmax z21.s, p3/M, z21.s, z27.s\n"
+ "fmin z23.s, p3/M, z23.s, z26.s\n"
+ "fmin z22.s, p3/M, z22.s, z26.s\n"
+ "st1w { z24.s }, p0, [x9]\n"
+ "fmin z21.s, p3/M, z21.s, z26.s\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x13, LSL #2]\n"
"addvl x23, x23, #1\n"
- "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "st1w { z22.s }, p0, [x22]\n"
- "addvl x28, x28, #1\n"
- "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "addvl x10, x10, #-6\n"
- "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
- "addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
- "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z18.s }, p2/Z, [x23]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z24, z25\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z25\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z25\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z25\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x24]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "mov p0.b, p2.b\n"
+ "add x16, x16, #0x1\n"
+ "add x20, x17, #0x1\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x28, LSL #2]\n"
"fmla z23.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "cmp x16, x22\n"
"fmla z22.s, p3/M, z2.s, z12.s\n"
"fmla z21.s, p3/M, z1.s, z12.s\n"
- "add x14, x14, #0x1\n"
- "cmp x14, x20\n"
+ "csel x17, x17, x20, LT\n"
+ "csel x16, x16, XZR, LT\n"
"fmla z24.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
- "add x21, x10, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [x10, x14, LSL #2]\n"
"fmla z22.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x28, LSL #2]\n"
"fmla z21.s, p3/M, z3.s, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "cmp x17, x21\n"
"fmla z24.s, p3/M, z7.s, z13.s\n"
"fmla z23.s, p3/M, z6.s, z13.s\n"
- "csel x10, x10, x21, LT\n"
- "mov p0.b, p2.b\n"
"fmla z22.s, p3/M, z4.s, z13.s\n"
"fmla z21.s, p3/M, z8.s, z17.s\n"
"ld1w { z17.s }, p2/Z, [x26]\n"
- "csel x14, x14, XZR, LT\n"
"fmla z24.s, p3/M, z1.s, z16.s\n"
"fmla z23.s, p3/M, z0.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
- "cmp x10, x20\n"
+ "ld1w { z16.s }, p2/Z, [x26, x27, LSL #2]\n"
"fmla z22.s, p3/M, z5.s, z20.s\n"
"fmla z21.s, p3/M, z4.s, z20.s\n"
"fmla z24.s, p3/M, z2.s, z18.s\n"
@@ -278,35 +278,35 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
"fmla z21.s, p3/M, z2.s, z16.s\n"
"fmla z24.s, p3/M, z8.s, z20.s\n"
"fmla z23.s, p3/M, z7.s, z20.s\n"
- "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x25, x27, LSL #2]\n"
"fmla z22.s, p3/M, z3.s, z19.s\n"
"fmla z21.s, p3/M, z5.s, z18.s\n"
"fmla z24.s, p3/M, z3.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x14, LSL #2]\n"
"fmla z23.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x28, LSL #2]\n"
"fmla z22.s, p3/M, z7.s, z17.s\n"
"fmla z21.s, p3/M, z6.s, z17.s\n"
"fmla z24.s, p3/M, z6.s, z19.s\n"
"fmla z23.s, p3/M, z8.s, z18.s\n"
- "fmax z24.s, p3/M, z24.s, z26.s\n"
- "fmax z23.s, p3/M, z23.s, z26.s\n"
"fmla z22.s, p3/M, z8.s, z16.s\n"
"fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z26.s\n"
- "fmax z21.s, p3/M, z21.s, z26.s\n"
- "fmin z24.s, p3/M, z24.s, z25.s\n"
- "fmin z23.s, p3/M, z23.s, z25.s\n"
- "st1w { z24.s }, p0, [x28]\n"
- "fmin z22.s, p3/M, z22.s, z25.s\n"
- "fmin z21.s, p3/M, z21.s, z25.s\n"
- "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
- "st1w { z22.s }, p0, [x22]\n"
- "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z27.s\n"
+ "fmax z23.s, p3/M, z23.s, z27.s\n"
+ "fmin z24.s, p3/M, z24.s, z26.s\n"
+ "fmin z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z27.s\n"
+ "fmax z21.s, p3/M, z21.s, z27.s\n"
+ "st1w { z24.s }, p0, [x9]\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z26.s\n"
+ "fmin z21.s, p3/M, z21.s, z26.s\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x13, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index 873b4736ff..4b100a9b21 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -83,210 +83,210 @@ void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
"ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
"add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
"cntw x14\n"
- "ldp x13, x12, [x20, #0x0]\n"
- "ldp x11, x10, [x20, #0x10]\n"
- "mov x9, #0x0\n"
+ "mov x13, #0x0\n"
+ "ldr x24, [x15, #0x20]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z20.s }, p3/Z, [x16]\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "ld1w { z27.s }, p3/Z, [x16]\n"
"ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
- "cmp x14, %x[n_channels]\n"
"ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
"ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
"sub x28, XZR, x14\n"
"ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
"ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
"ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
"addvl x16, x16, #16\n"
- "ldp x24, x23, [x15, #0x0]\n"
- "ldp x22, x21, [x15, #0x10]\n"
- "ldr x20, [x15, #0x20]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
"ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
"ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
"ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x24, x9, LSL #2]\n"
"addvl x16, x16, #-6\n"
- "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x24, x13, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
- "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
"ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x22, [x15, #0x38]\n"
+ "ldr x25, [x15, #0x30]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ldr x24, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr x23, [x15, #0x40]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "incw x28\n"
+ "ld1w { z18.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "mov p0.b, p2.b\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
"fmla z23.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x13, LSL #2]\n"
"fmla z22.s, p3/M, z2.s, z12.s\n"
"fmla z21.s, p3/M, z1.s, z12.s\n"
- "ldr x20, [x15, #0x40]\n"
- "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "ld1w { z27.s }, p3/Z, [x16]\n"
"fmla z24.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x22, [x15, #0x50]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x13, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
"fmla z22.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x13, LSL #2]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
"fmla z21.s, p3/M, z3.s, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x58]\n"
"fmla z24.s, p3/M, z7.s, z13.s\n"
"fmla z23.s, p3/M, z6.s, z13.s\n"
- "ldr x20, [x15, #0x60]\n"
- "ldr x27, [x15, #0x68]\n"
"fmla z22.s, p3/M, z4.s, z13.s\n"
"fmla z21.s, p3/M, z8.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x26, [x15, #0x70]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x13, LSL #2]\n"
"fmla z24.s, p3/M, z1.s, z16.s\n"
"fmla z23.s, p3/M, z0.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x25, [x15, #0x78]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
"fmla z22.s, p3/M, z5.s, z20.s\n"
"fmla z21.s, p3/M, z4.s, z20.s\n"
- "whilelt p1.s, x14, %x[n_channels]\n"
- "ldp x24, x23, [x15, #0x0]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
"fmla z24.s, p3/M, z2.s, z18.s\n"
"fmla z23.s, p3/M, z1.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldp x22, x21, [x15, #0x10]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
"fmla z22.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
"fmla z21.s, p3/M, z2.s, z16.s\n"
- "ldr x20, [x15, #0x20]\n"
- "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
"fmla z24.s, p3/M, z8.s, z20.s\n"
+ "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
"fmla z23.s, p3/M, z7.s, z20.s\n"
- "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
- "incw x28\n"
+ "ld1w { z18.s }, p2/Z, [x27, x13, LSL #2]\n"
"fmla z22.s, p3/M, z3.s, z19.s\n"
"fmla z21.s, p3/M, z5.s, z18.s\n"
- "mov p0.b, p2.b\n"
- "ld1w { z20.s }, p3/Z, [x16]\n"
"fmla z24.s, p3/M, z3.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
"fmla z23.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x25, x13, LSL #2]\n"
+ "incw x13\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
"fmla z22.s, p3/M, z7.s, z17.s\n"
"fmla z21.s, p3/M, z6.s, z17.s\n"
- "incw x9\n"
"ld1w { z11.s }, p1/Z, [x22, x14, LSL #2]\n"
"fmla z24.s, p3/M, z6.s, z19.s\n"
- "fmla z23.s, p3/M, z8.s, z18.s\n"
"ld1w { z9.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmla z23.s, p3/M, z8.s, z18.s\n"
"ld1w { z10.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "whilelt p2.s, x13, %x[n_channels]\n"
"fmla z22.s, p3/M, z8.s, z16.s\n"
"fmla z21.s, p3/M, z7.s, z16.s\n"
"ld1w { z12.s }, p1/Z, [x21, x14, LSL #2]\n"
"incw x14\n"
"fmax z24.s, p3/M, z24.s, z26.s\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
"fmax z23.s, p3/M, z23.s, z26.s\n"
- "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
"fmax z22.s, p3/M, z22.s, z26.s\n"
"fmax z21.s, p3/M, z21.s, z26.s\n"
- "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "whilelt p2.s, x9, %x[n_channels]\n"
"cmp x14, %x[n_channels]\n"
- "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
- "addvl x16, x16, #16\n"
"fmin z24.s, p3/M, z24.s, z25.s\n"
- "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
"fmin z23.s, p3/M, z23.s, z25.s\n"
"fmin z22.s, p3/M, z22.s, z25.s\n"
- "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
"fmin z21.s, p3/M, z21.s, z25.s\n"
- "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z24.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z23.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x9, x28, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
- "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
- "ldr x21, [x15, #0x28]\n"
- "ldr x20, [x15, #0x30]\n"
- "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x22, [x15, #0x38]\n"
+ "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+ "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x21, [x15, #0x30]\n"
+ "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ldr x27, [x15, #0x38]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr x26, [x15, #0x40]\n"
+ "ldr x25, [x15, #0x50]\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "ld1w { z18.s }, p2/Z, [x22, x13, LSL #2]\n"
+ "ldr x24, [x15, #0x58]\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
"fmla z23.s, p3/M, z2.s, z11.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x13, LSL #2]\n"
"fmla z22.s, p3/M, z2.s, z12.s\n"
"fmla z21.s, p3/M, z1.s, z12.s\n"
- "ldr x20, [x15, #0x40]\n"
- "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
"fmla z24.s, p3/M, z5.s, z12.s\n"
"fmla z23.s, p3/M, z4.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x21, [x15, #0x50]\n"
+ "ld1w { z16.s }, p2/Z, [x27, x13, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
"fmla z22.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x13, LSL #2]\n"
"fmla z21.s, p3/M, z3.s, z13.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x15, #0x58]\n"
"fmla z24.s, p3/M, z7.s, z13.s\n"
"fmla z23.s, p3/M, z6.s, z13.s\n"
- "ldr x23, [x15, #0x60]\n"
- "ldr x22, [x15, #0x68]\n"
"fmla z22.s, p3/M, z4.s, z13.s\n"
"fmla z21.s, p3/M, z8.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ldr x21, [x15, #0x70]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x13, LSL #2]\n"
"fmla z24.s, p3/M, z1.s, z16.s\n"
"fmla z23.s, p3/M, z0.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x15, #0x78]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x13, LSL #2]\n"
"fmla z22.s, p3/M, z5.s, z20.s\n"
"fmla z21.s, p3/M, z4.s, z20.s\n"
- "incw x28\n"
- "mov p0.b, p2.b\n"
"fmla z24.s, p3/M, z2.s, z18.s\n"
"fmla z23.s, p3/M, z1.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x23, x13, LSL #2]\n"
"fmla z22.s, p3/M, z0.s, z17.s\n"
"fmla z21.s, p3/M, z2.s, z16.s\n"
"fmla z24.s, p3/M, z8.s, z20.s\n"
"fmla z23.s, p3/M, z7.s, z20.s\n"
- "ld1w { z18.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x22, x13, LSL #2]\n"
"fmla z22.s, p3/M, z3.s, z19.s\n"
"fmla z21.s, p3/M, z5.s, z18.s\n"
"fmla z24.s, p3/M, z3.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x13, LSL #2]\n"
"fmla z23.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x13, LSL #2]\n"
"fmla z22.s, p3/M, z7.s, z17.s\n"
"fmla z21.s, p3/M, z6.s, z17.s\n"
"fmla z24.s, p3/M, z6.s, z19.s\n"
"fmla z23.s, p3/M, z8.s, z18.s\n"
- "fmax z24.s, p3/M, z24.s, z26.s\n"
- "fmax z23.s, p3/M, z23.s, z26.s\n"
"fmla z22.s, p3/M, z8.s, z16.s\n"
"fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z26.s\n"
- "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
"fmin z24.s, p3/M, z24.s, z25.s\n"
"fmin z23.s, p3/M, z23.s, z25.s\n"
- "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "st1w { z24.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z23.s }, p0, [x11, x28, LSL #2]\n"
"fmin z22.s, p3/M, z22.s, z25.s\n"
"fmin z21.s, p3/M, z21.s, z25.s\n"
- "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
- "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x10, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x9, x28, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
index 015d0e63c2..17a8933c3f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,369 +88,369 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x13, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x5, #0x0\n"
+ "mov x6, #0x0\n"
"1:" // Tile loop
- "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x3\n"
"mov x25, #0x3\n"
- "mov x24, #0x3\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
- "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cntw x15\n"
- "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
- "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x12, x17, x17\n"
- "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
- "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x10, x14, x23, LSL #2\n"
- "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
- "add x9, x10, x23, LSL #2\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cntw x8\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z14.s }, p3/Z, [x13]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
- "add x28, x9, x23, LSL #2\n"
- "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
- "add x27, x12, x17\n"
- "add x11, x11, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
- "add x26, x28, x23, LSL #2\n"
- "add x25, x27, x17\n"
- "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
- "addvl x13, x13, #16\n"
- "add x24, x11, x21, LSL #2\n"
- "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "cmp x15, %x[n_channels]\n"
- "add x23, x24, x21, LSL #2\n"
- "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
- "add x22, x16, x16\n"
- "mov x21, #0x0\n"
- "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x9, x12, LSL #2]\n"
- "sub x20, XZR, x15\n"
- "ld1w { z10.s }, p2/Z, [x14]\n"
- "ld1w { z11.s }, p2/Z, [x14, x25, LSL #2]\n"
- "addvl x13, x13, #-6\n"
- "ld1w { z12.s }, p2/Z, [x26]\n"
- "ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
+ "mov x16, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x22, x5, x24\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x12, x7, x7\n"
+ "cmp x8, %x[n_channels]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x21, x5, x23\n" // offset = tile_i * ld_output_row
+ "add x11, x12, x7\n"
+ "add x10, x17, x17\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "madd x22, x6, x7, x22\n" // offset += tile_j * ld_input_col
+ "ld1w { z31.s }, p3/Z, [x14]\n"
+ "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
+ "add x9, x11, x7\n"
+ "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
+ "sub x20, XZR, x8\n"
+ "madd x21, x6, x17, x21\n" // offset += tile_j * ld_output_col
+ "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "add x15, x15, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x28, x15, x24, LSL #2\n"
+ "add x27, x28, x24, LSL #2\n"
+ "ld1w { z10.s }, p2/Z, [x15]\n"
+ "ld1w { z11.s }, p2/Z, [x15, x9, LSL #2]\n"
+ "add x26, x27, x24, LSL #2\n"
+ "add x13, x13, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x25, x26, x24, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
+ "add x24, x13, x23, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x27, x12, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x25]\n"
+ "addvl x14, x14, #-6\n"
+ "add x23, x24, x23, LSL #2\n"
+ "ld1w { z13.s }, p2/Z, [x28, x12, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
- "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x15, %x[n_channels]\n"
- "incw x21\n"
- "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "incw x15\n"
+ "movprfx z30, z31\n fmla z30.s, p3/M, z7.s, z9.s\n"
+ "movprfx z29, z31\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x8, %x[n_channels]\n"
+ "incw x16\n"
+ "movprfx z28, z31\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z31\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "incw x8\n"
"mov p0.b, p2.b\n"
- "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z31\n fmla z26.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z31\n fmla z25.s, p3/M, z3.s, z9.s\n"
"incw x20\n"
- "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
- "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
- "fmla z29.s, p3/M, z6.s, z18.s\n"
- "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z27.s, p3/M, z3.s, z13.s\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z25.s, p3/M, z1.s, z13.s\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
- "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z17.s\n"
- "ld1w { z14.s }, p3/Z, [x13]\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z18.s\n"
- "fmla z20.s, p3/M, z0.s, z18.s\n"
- "fmla z26.s, p3/M, z4.s, z18.s\n"
- "fmla z25.s, p3/M, z3.s, z18.s\n"
- "fmla z22.s, p3/M, z1.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x10]\n"
- "fmla z29.s, p3/M, z2.s, z16.s\n"
- "fmla z27.s, p3/M, z1.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x28]\n"
- "fmla z24.s, p3/M, z4.s, z23.s\n"
- "fmla z28.s, p3/M, z1.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z20.s, p3/M, z2.s, z23.s\n"
- "fmla z21.s, p3/M, z1.s, z23.s\n"
- "fmla z29.s, p3/M, z8.s, z23.s\n"
- "fmla z27.s, p3/M, z7.s, z23.s\n"
- "fmla z25.s, p3/M, z5.s, z23.s\n"
- "fmla z26.s, p3/M, z0.s, z19.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
- "fmla z22.s, p3/M, z3.s, z18.s\n"
- "fmla z24.s, p3/M, z2.s, z16.s\n"
- "fmla z20.s, p3/M, z4.s, z17.s\n"
- "fmla z21.s, p3/M, z3.s, z17.s\n"
- "fmla z28.s, p3/M, z3.s, z19.s\n"
- "fmla z27.s, p3/M, z5.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z18.s\n"
- "fmla z25.s, p3/M, z7.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
- "fmla z22.s, p3/M, z5.s, z17.s\n"
- "fmla z24.s, p3/M, z6.s, z17.s\n"
- "fmla z21.s, p3/M, z5.s, z19.s\n"
- "fmla z20.s, p3/M, z6.s, z16.s\n"
- "fmla z26.s, p3/M, z8.s, z17.s\n"
- "fmla z22.s, p3/M, z7.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z18.s\n"
- "fmla z25.s, p3/M, z0.s, z18.s\n"
- "fmla z24.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z17.s\n"
- "addvl x10, x10, #1\n"
- "fmla z21.s, p3/M, z7.s, z17.s\n"
- "fmla z28.s, p3/M, z4.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z18.s\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
- "addvl x28, x28, #1\n"
- "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "movprfx z24, z31\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z23, z31\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z22.s }, p2/Z, [x27, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z13.s\n"
+ "fmla z26.s, p3/M, z1.s, z13.s\n"
+ "fmla z25.s, p3/M, z0.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "movprfx z21, z31\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z31.s }, p3/Z, [x14]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z0.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x28]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z25.s, p3/M, z4.s, z22.s\n"
+ "fmla z23.s, p3/M, z1.s, z22.s\n"
+ "fmla z26.s, p3/M, z5.s, z22.s\n"
+ "fmla z21.s, p3/M, z2.s, z22.s\n"
+ "fmla z27.s, p3/M, z0.s, z20.s\n"
+ "fmla z30.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z22.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
"fmla z25.s, p3/M, z2.s, z16.s\n"
- "fmla z24.s, p3/M, z1.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "addvl x14, x14, #1\n"
- "fmla z20.s, p3/M, z3.s, z17.s\n"
- "fmla z21.s, p3/M, z4.s, z19.s\n"
- "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
- "ld1w { z10.s }, p1/Z, [x14]\n"
- "fmla z26.s, p3/M, z7.s, z17.s\n"
- "fmla z25.s, p3/M, z6.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x9]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "fmax z29.s, p3/M, z29.s, z31.s\n"
- "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
- "fmla z27.s, p3/M, z0.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z24.s, p3/M, z7.s, z19.s\n"
- "addvl x9, x9, #1\n"
- "fmla z20.s, p3/M, z5.s, z19.s\n"
- "fmla z22.s, p3/M, z0.s, z18.s\n"
- "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
- "fmin z29.s, p3/M, z29.s, z30.s\n"
- "fmla z21.s, p3/M, z2.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
- "fmax z25.s, p3/M, z25.s, z31.s\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "fmla z26.s, p3/M, z3.s, z18.s\n"
- "fmax z28.s, p3/M, z28.s, z31.s\n"
- "fmax z26.s, p3/M, z26.s, z31.s\n"
- "fmla z27.s, p3/M, z8.s, z17.s\n"
- "fmla z24.s, p3/M, z5.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z31.s\n"
- "fmax z24.s, p3/M, z24.s, z31.s\n"
- "fmla z22.s, p3/M, z8.s, z16.s\n"
- "fmla z20.s, p3/M, z7.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z31.s\n"
- "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z27.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x7, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z23.s, p3/M, z3.s, z18.s\n"
+ "fmla z26.s, p3/M, z7.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z18.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z27.s, p3/M, z8.s, z18.s\n"
+ "fmla z30.s, p3/M, z3.s, z19.s\n"
"fmla z21.s, p3/M, z6.s, z16.s\n"
- "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x11, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x11, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x7, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "fmla z23.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x11, LSL #2]\n"
"addvl x26, x26, #1\n"
- "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "cmp x15, %x[n_channels]\n"
- "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
- "addvl x13, x13, #16\n"
- "fmin z28.s, p3/M, z28.s, z30.s\n"
- "ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z30.s\n"
- "fmin z26.s, p3/M, z26.s, z30.s\n"
- "ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x26]\n"
- "fmin z25.s, p3/M, z25.s, z30.s\n"
- "fmin z24.s, p3/M, z24.s, z30.s\n"
- "ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
- "st1w { z28.s }, p0, [x11]\n"
- "fmin z22.s, p3/M, z22.s, z30.s\n"
- "fmin z20.s, p3/M, z20.s, z30.s\n"
- "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
- "fmin z21.s, p3/M, z21.s, z30.s\n"
- "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
- "addvl x11, x11, #1\n"
- "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
- "st1w { z26.s }, p0, [x24]\n"
- "addvl x13, x13, #-6\n"
- "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
- "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z28.s, p3/M, z4.s, z16.s\n"
+ "fmla z26.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, x12, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z17.s\n"
+ "addvl x15, x15, #1\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z17.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x27]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x15]\n"
+ "fmla z28.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "addvl x27, x27, #1\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z24.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x12, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "addvl x25, x25, #1\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n"
+ "fmla z24.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "cmp x8, %x[n_channels]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "ld1w { z9.s }, p1/Z, [x27, x12, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x15, x9, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x25]\n"
+ "ld1w { z13.s }, p1/Z, [x28, x12, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z29.s }, p0, [x13]\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z30.s }, p0, [x13, x17, LSL #2]\n"
+ "st1w { z28.s }, p0, [x13, x10, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "addvl x14, x14, #-6\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "st1w { z26.s }, p0, [x24, x17, LSL #2]\n"
+ "st1w { z25.s }, p0, [x24, x10, LSL #2]\n"
"addvl x24, x24, #1\n"
- "st1w { z22.s }, p0, [x23]\n"
- "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
- "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x17, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x10, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
- "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z4.s, z13.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x8, x8, #0x1\n"
- "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
- "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
- "cmp x8, x20\n"
- "add x21, x13, #0x1\n"
- "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
- "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
- "csel x13, x13, x21, LT\n"
- "fmla z29.s, p3/M, z6.s, z18.s\n"
- "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "movprfx z30, z31\n fmla z30.s, p3/M, z7.s, z9.s\n"
+ "movprfx z29, z31\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z28, z31\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z31\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "movprfx z26, z31\n fmla z26.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z31\n fmla z25.s, p3/M, z3.s, z9.s\n"
"mov p0.b, p2.b\n"
- "csel x8, x8, XZR, LT\n"
- "fmla z28.s, p3/M, z5.s, z13.s\n"
- "fmla z27.s, p3/M, z3.s, z13.s\n"
- "cmp x13, x20\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z25.s, p3/M, z1.s, z13.s\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
- "fmla z22.s, p3/M, z6.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
- "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
- "fmla z28.s, p3/M, z7.s, z18.s\n"
- "fmla z20.s, p3/M, z0.s, z18.s\n"
- "fmla z26.s, p3/M, z4.s, z18.s\n"
- "fmla z25.s, p3/M, z3.s, z18.s\n"
- "fmla z22.s, p3/M, z1.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x10]\n"
- "fmla z29.s, p3/M, z2.s, z16.s\n"
- "fmla z27.s, p3/M, z1.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x28]\n"
- "fmla z24.s, p3/M, z4.s, z23.s\n"
- "fmla z28.s, p3/M, z1.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
- "fmla z20.s, p3/M, z2.s, z23.s\n"
- "fmla z21.s, p3/M, z1.s, z23.s\n"
- "fmla z29.s, p3/M, z8.s, z23.s\n"
- "fmla z27.s, p3/M, z7.s, z23.s\n"
- "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "movprfx z24, z31\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z23, z31\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "add x6, x6, #0x1\n"
+ "add x20, x5, #0x1\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z22.s }, p2/Z, [x27, x11, LSL #2]\n"
+ "cmp x6, x22\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z13.s\n"
+ "csel x5, x5, x20, LT\n"
+ "fmla z26.s, p3/M, z1.s, z13.s\n"
+ "fmla z25.s, p3/M, z0.s, z13.s\n"
+ "csel x6, x6, XZR, LT\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "movprfx z21, z31\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "cmp x5, x21\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z0.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x28]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z25.s, p3/M, z4.s, z22.s\n"
+ "fmla z23.s, p3/M, z1.s, z22.s\n"
+ "fmla z26.s, p3/M, z5.s, z22.s\n"
+ "fmla z21.s, p3/M, z2.s, z22.s\n"
+ "fmla z27.s, p3/M, z0.s, z20.s\n"
+ "fmla z30.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z22.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x7, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z23.s, p3/M, z3.s, z18.s\n"
+ "fmla z26.s, p3/M, z7.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z18.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z27.s, p3/M, z8.s, z18.s\n"
+ "fmla z30.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
"fmla z26.s, p3/M, z0.s, z19.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
- "fmla z22.s, p3/M, z3.s, z18.s\n"
- "fmla z24.s, p3/M, z2.s, z16.s\n"
- "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, x11, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x11, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x7, LSL #2]\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "fmla z23.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z28.s, p3/M, z4.s, z16.s\n"
+ "fmla z26.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, x12, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z17.s\n"
"fmla z21.s, p3/M, z3.s, z17.s\n"
- "fmla z28.s, p3/M, z3.s, z19.s\n"
- "fmla z27.s, p3/M, z5.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
- "fmla z26.s, p3/M, z6.s, z18.s\n"
- "fmla z25.s, p3/M, z7.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
- "fmla z22.s, p3/M, z5.s, z17.s\n"
- "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z17.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x27]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "fmla z28.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
"fmla z21.s, p3/M, z5.s, z19.s\n"
- "fmla z20.s, p3/M, z6.s, z16.s\n"
- "fmla z26.s, p3/M, z8.s, z17.s\n"
- "fmla z22.s, p3/M, z7.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z18.s\n"
- "fmla z25.s, p3/M, z0.s, z18.s\n"
- "fmla z24.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
- "fmla z20.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z7.s, z17.s\n"
- "fmla z28.s, p3/M, z4.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z26.s, p3/M, z1.s, z18.s\n"
- "fmla z29.s, p3/M, z5.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z16.s\n"
- "fmla z25.s, p3/M, z2.s, z16.s\n"
- "fmla z24.s, p3/M, z1.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "fmla z20.s, p3/M, z3.s, z17.s\n"
- "fmla z21.s, p3/M, z4.s, z19.s\n"
- "fmla z26.s, p3/M, z7.s, z17.s\n"
- "fmla z25.s, p3/M, z6.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x9]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z29.s, p3/M, z1.s, z16.s\n"
- "fmax z29.s, p3/M, z29.s, z31.s\n"
- "fmin z29.s, p3/M, z29.s, z30.s\n"
- "fmla z27.s, p3/M, z0.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
- "fmla z24.s, p3/M, z7.s, z19.s\n"
- "fmla z20.s, p3/M, z5.s, z19.s\n"
- "fmla z22.s, p3/M, z0.s, z18.s\n"
- "fmla z21.s, p3/M, z2.s, z17.s\n"
- "fmla z25.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
- "fmax z25.s, p3/M, z25.s, z31.s\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "fmla z26.s, p3/M, z3.s, z18.s\n"
- "fmax z28.s, p3/M, z28.s, z31.s\n"
- "fmax z26.s, p3/M, z26.s, z31.s\n"
- "fmla z27.s, p3/M, z8.s, z17.s\n"
- "fmla z24.s, p3/M, z5.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z31.s\n"
- "fmax z24.s, p3/M, z24.s, z31.s\n"
- "fmla z22.s, p3/M, z8.s, z16.s\n"
- "fmla z20.s, p3/M, z7.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z31.s\n"
- "fmax z20.s, p3/M, z20.s, z31.s\n"
- "fmla z21.s, p3/M, z6.s, z16.s\n"
- "fmax z21.s, p3/M, z21.s, z31.s\n"
- "fmin z28.s, p3/M, z28.s, z30.s\n"
- "st1w { z28.s }, p0, [x11]\n"
- "fmin z27.s, p3/M, z27.s, z30.s\n"
- "fmin z26.s, p3/M, z26.s, z30.s\n"
- "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z30.s\n"
- "fmin z24.s, p3/M, z24.s, z30.s\n"
- "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
- "fmin z22.s, p3/M, z22.s, z30.s\n"
- "fmin z20.s, p3/M, z20.s, z30.s\n"
- "st1w { z26.s }, p0, [x24]\n"
- "fmin z21.s, p3/M, z21.s, z30.s\n"
- "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
- "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
- "st1w { z22.s }, p0, [x23]\n"
- "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
- "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x12, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z26.s }, p0, [x24, x17, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z29.s }, p0, [x13]\n"
+ "st1w { z30.s }, p0, [x13, x17, LSL #2]\n"
+ "st1w { z28.s }, p0, [x13, x10, LSL #2]\n"
+ "st1w { z25.s }, p0, [x24, x10, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x17, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x10, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
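In both the removed and added code paths above, each accumulator is clamped against broadcast minimum and maximum activation values (loaded from offsetof_args_min and offsetof_args_max) before being stored; the updated kernel simply keeps those bounds in z15/z14 instead of z31/z30. A minimal scalar sketch of that clamp, for orientation only (clamp_activation is a hypothetical helper name, not part of the generated kernel):

    // Illustrative scalar model of the fmax-with-min / fmin-with-max pairs above.
    static inline float clamp_activation(float acc, float act_min, float act_max)
    {
        acc = (acc < act_min) ? act_min : acc;   // corresponds to fmax zN.s, p3/M, zN.s, z15.s
        acc = (acc > act_max) ? act_max : acc;   // corresponds to fmin zN.s, p3/M, zN.s, z14.s
        return acc;
    }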
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index 4809b0c45c..27baf11d17 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,384 +90,384 @@ void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
"add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1w { z14.s }, p3/Z, [x8]\n"
- "cntw x16\n"
- "mov x15, #0x0\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "cntw x15\n"
+ "mov x14, #0x0\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ld1w { z15.s }, p3/Z, [x8]\n"
"ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
- "whilelt p2.s, XZR, %x[n_channels]\n"
"ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "sub x13, XZR, x15\n"
"ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
- "sub x14, XZR, x16\n"
"ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
"addvl x8, x8, #16\n"
- "ldp x24, x23, [x17, #0x0]\n"
- "ldp x22, x21, [x17, #0x10]\n"
- "ldr x20, [x17, #0x20]\n"
- "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x14, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
"addvl x8, x8, #-6\n"
- "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x14, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
- "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
- "ldr x23, [x17, #0x30]\n"
- "ldr x26, [x17, #0x38]\n"
- "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ldr x22, [x17, #0x28]\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x27, [x17, #0x38]\n"
+ "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z15\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "ldr x26, [x17, #0x28]\n"
"ldr x21, [x17, #0x48]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z15\n fmla z26.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z15\n fmla z25.s, p3/M, z3.s, z9.s\n"
"ldr x20, [x17, #0x40]\n"
- "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
- "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
- "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
"ldr x25, [x17, #0x50]\n"
+ "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z23, z15\n fmla z23.s, p3/M, z0.s, z9.s\n"
"ldr x24, [x17, #0x58]\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
- "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
"ldr x23, [x17, #0x60]\n"
- "fmla z29.s, p3/M, z5.s, z13.s\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "ldr x12, [x17, #0x70]\n"
- "ldr x11, [x17, #0x88]\n"
- "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
- "fmla z27.s, p3/M, z3.s, z13.s\n"
- "incw x14\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldr x12, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z13.s\n"
+ "ldr x22, [x17, #0x70]\n"
+ "fmla z26.s, p3/M, z1.s, z13.s\n"
+ "fmla z25.s, p3/M, z0.s, z13.s\n"
+ "incw x13\n"
"mov p1.b, p2.b\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z25.s, p3/M, z1.s, z13.s\n"
- "ldr x10, [x13, #0x0]\n"
- "whilelt p0.s, x16, %x[n_channels]\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
- "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
- "fmla z29.s, p3/M, z7.s, z18.s\n"
- "ldr x22, [x17, #0x68]\n"
- "ldr x21, [x17, #0x78]\n"
- "fmla z28.s, p3/M, z0.s, z17.s\n"
- "fmla z22.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "movprfx z21, z15\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z17.s\n"
+ "ldr x11, [x16, #0x0]\n"
+ "whilelt p0.s, x15, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "ldr x10, [x17, #0x78]\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
"ldr x20, [x17, #0x80]\n"
- "fmla z26.s, p3/M, z4.s, z18.s\n"
- "fmla z25.s, p3/M, z3.s, z18.s\n"
- "ldr x9, [x13, #0x8]\n"
- "ldr x28, [x13, #0x10]\n"
- "fmla z21.s, p3/M, z0.s, z18.s\n"
- "fmla z24.s, p3/M, z4.s, z19.s\n"
- "ldr x27, [x13, #0x18]\n"
- "ld1w { z14.s }, p3/Z, [x8]\n"
- "fmla z23.s, p3/M, z1.s, z18.s\n"
- "fmla z29.s, p3/M, z1.s, z17.s\n"
- "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z27.s, p3/M, z1.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z22.s\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "ldr x27, [x16, #0x18]\n"
+ "ld1w { z15.s }, p3/Z, [x8]\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x14, LSL #2]\n"
"ldr x26, [x17, #0x90]\n"
- "fmla z25.s, p3/M, z5.s, z19.s\n"
- "fmla z21.s, p3/M, z2.s, z19.s\n"
- "ldr x25, [x17, #0xa0]\n"
- "ldr x24, [x17, #0x98]\n"
- "fmla z26.s, p3/M, z0.s, z20.s\n"
- "fmla z24.s, p3/M, z2.s, z17.s\n"
- "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z26.s, p3/M, z5.s, z22.s\n"
+ "fmla z23.s, p3/M, z1.s, z22.s\n"
+ "fmla z21.s, p3/M, z2.s, z22.s\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z20.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z28.s, p3/M, z7.s, z22.s\n"
+ "fmla z25.s, p3/M, z2.s, z20.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "ldr x23, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldr x22, [x17, #0xa8]\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ldr x21, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmla z23.s, p3/M, z3.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z30.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z28.s, p3/M, z4.s, z16.s\n"
+ "fmla z26.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x17, #0x20]\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x14, LSL #2]\n"
"fmla z27.s, p3/M, z7.s, z19.s\n"
- "fmla z22.s, p3/M, z1.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
- "ldr x23, [x17, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z28.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmla z21.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x14, LSL #2]\n"
"fmla z25.s, p3/M, z7.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldr x22, [x17, #0xc0]\n"
- "fmla z24.s, p3/M, z6.s, z18.s\n"
- "fmla z21.s, p3/M, z4.s, z18.s\n"
- "fmla z29.s, p3/M, z3.s, z20.s\n"
- "fmla z27.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z18.s\n"
- "fmla z22.s, p3/M, z3.s, z18.s\n"
- "ldr x21, [x17, #0xb0]\n"
- "ldr x20, [x17, #0xb8]\n"
+ "fmla z23.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmax z29.s, p3/M, z29.s, z14.s\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
"fmla z26.s, p3/M, z8.s, z18.s\n"
- "fmla z24.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z6.s, z16.s\n"
- "fmla z28.s, p3/M, z3.s, z19.s\n"
- "fmla z25.s, p3/M, z0.s, z19.s\n"
- "fmla z22.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z19.s\n"
- "fmla z26.s, p3/M, z1.s, z19.s\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z17.s\n"
- "fmla z25.s, p3/M, z2.s, z17.s\n"
- "fmla z24.s, p3/M, z1.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z18.s\n"
- "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
- "ldr x25, [x17, #0x20]\n"
- "fmla z22.s, p3/M, z7.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z17.s\n"
- "fmla z26.s, p3/M, z7.s, z16.s\n"
- "fmla z25.s, p3/M, z6.s, z16.s\n"
- "fmla z23.s, p3/M, z4.s, z16.s\n"
- "fmla z21.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z18.s\n"
- "fmla z28.s, p3/M, z1.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z31.s\n"
- "fmin z28.s, p3/M, z28.s, z30.s\n"
- "fmla z27.s, p3/M, z0.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z16.s\n"
- "fmax z29.s, p3/M, z29.s, z31.s\n"
- "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
"fmla z21.s, p3/M, z5.s, z18.s\n"
- "fmin z29.s, p3/M, z29.s, z30.s\n"
- "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z0.s, z16.s\n"
- "fmla z22.s, p3/M, z2.s, z17.s\n"
- "ldr x24, [x13, #0x20]\n"
- "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z18.s\n"
- "fmla z26.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
"ldp x23, x22, [x17, #0x0]\n"
- "fmla z27.s, p3/M, z8.s, z17.s\n"
- "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmax z30.s, p3/M, z30.s, z14.s\n"
"ldp x21, x20, [x17, #0x10]\n"
- "fmax z27.s, p3/M, z27.s, z31.s\n"
- "fmla z23.s, p3/M, z8.s, z16.s\n"
- "fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmax z26.s, p3/M, z26.s, z31.s\n"
- "fmax z25.s, p3/M, z25.s, z31.s\n"
- "fmla z22.s, p3/M, z6.s, z16.s\n"
- "incw x15\n"
- "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
- "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
- "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
- "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z30.s\n"
- "fmin z26.s, p3/M, z26.s, z30.s\n"
- "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
- "incw x16\n"
- "fmin z25.s, p3/M, z25.s, z30.s\n"
- "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
- "fmax z24.s, p3/M, z24.s, z31.s\n"
- "fmax z23.s, p3/M, z23.s, z31.s\n"
- "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
- "ldr x23, [x13, #0x28]\n"
- "fmax z21.s, p3/M, z21.s, z31.s\n"
- "fmax z22.s, p3/M, z22.s, z31.s\n"
- "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
- "ldr x22, [x13, #0x30]\n"
- "ldr x21, [x13, #0x38]\n"
- "ldr x20, [x13, #0x40]\n"
- "whilelt p2.s, x15, %x[n_channels]\n"
- "cmp x16, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
- "fmin z24.s, p3/M, z24.s, z30.s\n"
- "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "incw x14\n"
+ "fmax z26.s, p3/M, z26.s, z14.s\n"
+ "fmin z29.s, p3/M, z29.s, z31.s\n"
"ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
- "fmin z21.s, p3/M, z21.s, z30.s\n"
- "fmin z22.s, p3/M, z22.s, z30.s\n"
- "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
"ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
- "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z14.s\n"
+ "fmax z27.s, p3/M, z27.s, z14.s\n"
+ "ld1w { z9.s }, p0/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z31.s\n"
+ "ld1w { z10.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z12.s }, p0/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x24, x15, LSL #2]\n"
+ "incw x15\n"
+ "fmin z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z27.s, p3/M, z27.s, z31.s\n"
+ "st1w { z29.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x28]\n"
+ "st1w { z30.s }, p1, [x11, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "fmin z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z14.s\n"
+ "fmax z24.s, p3/M, z24.s, z14.s\n"
+ "fmax z21.s, p3/M, z21.s, z14.s\n"
"ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "fmax z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z28.s }, p1, [x28, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x30]\n"
"addvl x8, x8, #16\n"
- "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
+ "st1w { z27.s }, p1, [x27, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x38]\n"
+ "whilelt p2.s, x14, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "st1w { z26.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x40]\n"
+ "fmin z25.s, p3/M, z25.s, z31.s\n"
+ "fmin z24.s, p3/M, z24.s, z31.s\n"
+ "fmin z21.s, p3/M, z21.s, z31.s\n"
+ "fmin z23.s, p3/M, z23.s, z31.s\n"
"ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
- "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
"ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
"addvl x8, x8, #-6\n"
- "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
+ "st1w { z25.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z24.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z21.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
- "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
- "ldr x23, [x17, #0x30]\n"
- "ldr x26, [x17, #0x38]\n"
- "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
- "fmla z29.s, p3/M, z0.s, z10.s\n"
- "ldr x22, [x17, #0x28]\n"
+ "movprfx z30, z15\n fmla z30.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z15\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x27, [x17, #0x38]\n"
+ "movprfx z28, z15\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z15\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "ldr x26, [x17, #0x28]\n"
"ldr x21, [x17, #0x48]\n"
- "fmla z28.s, p3/M, z4.s, z13.s\n"
- "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z15\n fmla z26.s, p3/M, z4.s, z9.s\n"
+ "movprfx z25, z15\n fmla z25.s, p3/M, z3.s, z9.s\n"
"ldr x20, [x17, #0x40]\n"
- "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
- "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
- "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
"ldr x25, [x17, #0x50]\n"
+ "movprfx z24, z15\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "movprfx z23, z15\n fmla z23.s, p3/M, z0.s, z9.s\n"
"ldr x24, [x17, #0x58]\n"
- "fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
- "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
"ldr x23, [x17, #0x60]\n"
- "fmla z29.s, p3/M, z5.s, z13.s\n"
- "fmla z28.s, p3/M, z6.s, z18.s\n"
- "ldr x12, [x17, #0x70]\n"
- "ldr x11, [x17, #0x88]\n"
- "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
- "fmla z27.s, p3/M, z3.s, z13.s\n"
- "incw x14\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldr x12, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z13.s\n"
+ "ldr x22, [x17, #0x70]\n"
+ "fmla z26.s, p3/M, z1.s, z13.s\n"
+ "fmla z25.s, p3/M, z0.s, z13.s\n"
+ "incw x13\n"
"mov p0.b, p2.b\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z25.s, p3/M, z1.s, z13.s\n"
- "ldr x10, [x13, #0x0]\n"
- "ldr x9, [x13, #0x8]\n"
- "fmla z24.s, p3/M, z0.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z6.s, z12.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
- "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
- "fmla z29.s, p3/M, z7.s, z18.s\n"
- "ldr x22, [x17, #0x68]\n"
- "ldr x21, [x17, #0x78]\n"
- "fmla z28.s, p3/M, z0.s, z17.s\n"
- "fmla z22.s, p3/M, z8.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "movprfx z21, z15\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z17.s\n"
+ "ldr x11, [x16, #0x0]\n"
+ "ldr x10, [x16, #0x8]\n"
+ "fmla z28.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "ldr x9, [x17, #0x78]\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
"ldr x20, [x17, #0x80]\n"
- "fmla z26.s, p3/M, z4.s, z18.s\n"
- "fmla z25.s, p3/M, z3.s, z18.s\n"
- "ldr x28, [x13, #0x10]\n"
- "ldr x27, [x13, #0x18]\n"
- "fmla z21.s, p3/M, z0.s, z18.s\n"
- "fmla z24.s, p3/M, z4.s, z19.s\n"
- "fmla z23.s, p3/M, z1.s, z18.s\n"
- "fmla z29.s, p3/M, z1.s, z17.s\n"
- "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
- "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z16.s\n"
- "fmla z27.s, p3/M, z1.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z22.s\n"
+ "ldr x28, [x16, #0x10]\n"
+ "ldr x27, [x16, #0x18]\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x14, LSL #2]\n"
"ldr x26, [x17, #0x90]\n"
- "fmla z25.s, p3/M, z5.s, z19.s\n"
- "fmla z21.s, p3/M, z2.s, z19.s\n"
- "ldr x25, [x17, #0xa0]\n"
- "ldr x24, [x17, #0x98]\n"
- "fmla z26.s, p3/M, z0.s, z20.s\n"
- "fmla z24.s, p3/M, z2.s, z17.s\n"
- "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z26.s, p3/M, z5.s, z22.s\n"
+ "fmla z23.s, p3/M, z1.s, z22.s\n"
+ "fmla z21.s, p3/M, z2.s, z22.s\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z20.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z28.s, p3/M, z7.s, z22.s\n"
+ "fmla z25.s, p3/M, z2.s, z20.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "ldr x23, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldr x22, [x17, #0xa8]\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ldr x21, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmla z23.s, p3/M, z3.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z30.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z28.s, p3/M, z4.s, z16.s\n"
+ "fmla z26.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z21.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x14, LSL #2]\n"
"fmla z27.s, p3/M, z7.s, z19.s\n"
- "fmla z22.s, p3/M, z1.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
- "ldr x23, [x17, #0xa8]\n"
- "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z28.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmla z21.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x14, LSL #2]\n"
"fmla z25.s, p3/M, z7.s, z18.s\n"
- "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
- "ldr x22, [x17, #0xc0]\n"
- "fmla z24.s, p3/M, z6.s, z18.s\n"
- "fmla z21.s, p3/M, z4.s, z18.s\n"
- "fmla z29.s, p3/M, z3.s, z20.s\n"
- "fmla z27.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z5.s, z18.s\n"
- "fmla z22.s, p3/M, z3.s, z18.s\n"
- "ldr x21, [x17, #0xb0]\n"
- "ldr x20, [x17, #0xb8]\n"
+ "fmla z23.s, p3/M, z4.s, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z14.s\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z0.s, z16.s\n"
"fmla z26.s, p3/M, z8.s, z18.s\n"
- "fmla z24.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z6.s, z16.s\n"
- "fmla z28.s, p3/M, z3.s, z19.s\n"
- "fmla z25.s, p3/M, z0.s, z19.s\n"
- "fmla z22.s, p3/M, z5.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z4.s, z19.s\n"
- "fmla z26.s, p3/M, z1.s, z19.s\n"
- "fmla z28.s, p3/M, z5.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z17.s\n"
- "fmla z25.s, p3/M, z2.s, z17.s\n"
- "fmla z24.s, p3/M, z1.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z18.s\n"
- "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z18.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z2.s, z17.s\n"
- "fmla z26.s, p3/M, z7.s, z16.s\n"
- "fmla z25.s, p3/M, z6.s, z16.s\n"
- "fmla z23.s, p3/M, z4.s, z16.s\n"
- "fmla z21.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z18.s\n"
- "fmla z28.s, p3/M, z1.s, z17.s\n"
- "fmax z28.s, p3/M, z28.s, z31.s\n"
- "fmin z28.s, p3/M, z28.s, z30.s\n"
- "fmla z27.s, p3/M, z0.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
- "fmla z29.s, p3/M, z6.s, z16.s\n"
- "fmax z29.s, p3/M, z29.s, z31.s\n"
- "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
"fmla z21.s, p3/M, z5.s, z18.s\n"
- "fmin z29.s, p3/M, z29.s, z30.s\n"
- "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z0.s, z16.s\n"
- "fmla z22.s, p3/M, z2.s, z17.s\n"
- "ldr x20, [x13, #0x20]\n"
- "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z8.s, z18.s\n"
- "fmla z26.s, p3/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z31.s\n"
- "fmla z27.s, p3/M, z8.s, z17.s\n"
- "fmla z24.s, p3/M, z5.s, z17.s\n"
- "fmax z27.s, p3/M, z27.s, z31.s\n"
- "fmax z25.s, p3/M, z25.s, z31.s\n"
- "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmax z30.s, p3/M, z30.s, z14.s\n"
+ "fmax z26.s, p3/M, z26.s, z14.s\n"
+ "fmla z24.s, p3/M, z8.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z14.s\n"
+ "fmax z27.s, p3/M, z27.s, z14.s\n"
+ "st1w { z29.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x28]\n"
"fmla z21.s, p3/M, z7.s, z16.s\n"
- "fmin z27.s, p3/M, z27.s, z30.s\n"
- "fmin z26.s, p3/M, z26.s, z30.s\n"
- "fmla z22.s, p3/M, z6.s, z16.s\n"
- "fmin z25.s, p3/M, z25.s, z30.s\n"
- "fmax z24.s, p3/M, z24.s, z31.s\n"
- "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z31.s\n"
- "fmax z21.s, p3/M, z21.s, z31.s\n"
- "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
- "ldr x23, [x13, #0x28]\n"
- "fmax z22.s, p3/M, z22.s, z31.s\n"
- "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
- "ldr x22, [x13, #0x30]\n"
- "ldr x21, [x13, #0x38]\n"
- "ldr x20, [x13, #0x40]\n"
- "fmin z24.s, p3/M, z24.s, z30.s\n"
- "fmin z23.s, p3/M, z23.s, z30.s\n"
- "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
- "fmin z21.s, p3/M, z21.s, z30.s\n"
- "fmin z22.s, p3/M, z22.s, z30.s\n"
- "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
- "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
- "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
+ "fmin z30.s, p3/M, z30.s, z31.s\n"
+ "fmin z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z14.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z30.s }, p0, [x11, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x20]\n"
+ "fmax z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z28.s }, p0, [x28, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x30]\n"
+ "fmin z25.s, p3/M, z25.s, z31.s\n"
+ "fmax z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z27.s }, p0, [x27, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x38]\n"
+ "st1w { z26.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z31.s\n"
+ "fmin z21.s, p3/M, z21.s, z31.s\n"
+ "st1w { z25.s }, p0, [x23, x13, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z31.s\n"
+ "st1w { z24.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z21.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p0, [x20, x13, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
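The indirect variant above reads its addresses from pointer tables (the inptrs block reached through x17 and, in the new code, the outptrs table loaded into x16) rather than computing row strides directly, and each st1w writes through one of those table entries offset by the current channel index. A simplified scalar sketch of that store pattern, under the assumption of plain C++ with hypothetical names:

    // Hypothetical scalar model of the indirect output addressing: one pointer per
    // output position, each indexed by the running channel offset (x13 in the assembly).
    static void store_outputs_indirect(float *const outptrs[], std::size_t n_outputs,
                                       std::size_t channel_index, const float *results)
    {
        for (std::size_t i = 0; i < n_outputs; ++i)
            outptrs[i][channel_index] = results[i];   // st1w { z.. }, p1, [xN, x13, LSL #2]
    }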
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
index 35445595f8..43d5b16dfb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,565 +88,565 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x16, #0x0\n"
- "mov x4, #0x0\n"
+ "mov x1, #0x0\n"
+ "mov x2, #0x0\n"
"1:" // Tile loop
- "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x20, #0x4\n"
"mov x25, #0x4\n"
- "mov x24, #0x4\n"
- "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
- "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
- "add x7, x5, x5\n"
- "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
- "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
- "cntw x16\n"
- "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
- "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x14, x7, x5\n"
+ "str x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "cntw x3\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
- "add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x13, x8, x23, LSL #2\n"
- "ld1w { z19.s }, p3/Z, [x17]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "add x12, x13, x23, LSL #2\n"
- "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "add x11, x12, x23, LSL #2\n"
- "add x10, x14, x5\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "add x9, x15, x22, LSL #2\n"
- "add x28, x11, x23, LSL #2\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "add x27, x10, x5\n"
- "add x26, x9, x22, LSL #2\n"
- "add x25, x6, x6\n"
- "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "cmp x16, %x[n_channels]\n"
- "add x24, x28, x23, LSL #2\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "add x23, x26, x22, LSL #2\n"
- "add x22, x25, x6\n"
- "ld1w { z9.s }, p2/Z, [x12, x7, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x8]\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x16\n"
- "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
- "addvl x17, x17, #-6\n"
+ "mov x6, #0x0\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x22, x1, x24\n" // offset = tile_i * ld_input_row
+ "mul x21, x1, x23\n" // offset = tile_i * ld_output_row
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cmp x3, %x[n_channels]\n"
+ "ld1rw { z27.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "add x16, x4, x4\n"
+ "add x15, x5, x5\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "madd x22, x2, x4, x22\n" // offset += tile_j * ld_input_col
+ "add x14, x16, x4\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "add x13, x15, x5\n"
+ "madd x21, x2, x5, x21\n" // offset += tile_j * ld_output_col
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "add x12, x14, x4\n"
+ "mul x22, x22, x20\n" // offset *= kernel_stride * output_size
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "add x11, x12, x4\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "sub x20, XZR, x3\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "add x7, x7, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x10, x7, x24, LSL #2\n"
+ "add x9, x10, x24, LSL #2\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "ld1w { z11.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "add x28, x9, x24, LSL #2\n"
+ "add x27, x28, x24, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "add x17, x17, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x26, x27, x24, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x9, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x9, x14, LSL #2]\n"
+ "add x25, x17, x23, LSL #2\n"
+ "add x24, x25, x23, LSL #2\n"
+ "add x23, x24, x23, LSL #2\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "whilelt p1.s, x16, %x[n_channels]\n"
- "incw x21\n"
- "movprfx z21, z19\n fmla z21.s, p3/M, z3.s, z9.s\n"
- "movprfx z22, z19\n fmla z22.s, p3/M, z1.s, z9.s\n"
- "incw x16\n"
+ "movprfx z14, z13\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x3, %x[n_channels]\n"
+ "incw x6\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z3.s, z9.s\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z1.s, z9.s\n"
+ "incw x3\n"
"mov p0.b, p2.b\n"
- "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "movprfx z15, z13\n fmla z15.s, p3/M, z0.s, z9.s\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z7.s, z9.s\n"
"incw x20\n"
- "movprfx z13, z19\n fmla z13.s, p3/M, z7.s, z9.s\n"
- "movprfx z17, z19\n fmla z17.s, p3/M, z6.s, z9.s\n"
- "movprfx z27, z19\n fmla z27.s, p3/M, z5.s, z9.s\n"
- "movprfx z18, z19\n fmla z18.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z11.s\n"
- "ld1w { z29.s }, p2/Z, [x24]\n"
- "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmla z21.s, p3/M, z4.s, z12.s\n"
- "fmla z22.s, p3/M, z2.s, z12.s\n"
- "fmla z20.s, p3/M, z1.s, z12.s\n"
- "movprfx z23, z19\n fmla z23.s, p3/M, z6.s, z29.s\n"
- "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z5.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z19.s, p3/M, z0.s, z10.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z15.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z6.s, z11.s\n"
"fmla z14.s, p3/M, z7.s, z9.s\n"
- "fmla z13.s, p3/M, z8.s, z12.s\n"
- "fmla z17.s, p3/M, z7.s, z12.s\n"
- "fmla z30.s, p3/M, z6.s, z12.s\n"
- "movprfx z26, z19\n fmla z26.s, p3/M, z3.s, z12.s\n"
- "movprfx z28, z19\n fmla z28.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
- "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z11.s\n"
- "fmla z21.s, p3/M, z6.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z22.s, p3/M, z4.s, z9.s\n"
- "fmla z20.s, p3/M, z3.s, z9.s\n"
- "movprfx z25, z19\n fmla z25.s, p3/M, z1.s, z9.s\n"
- "movprfx z29, z19\n fmla z29.s, p3/M, z0.s, z9.s\n"
- "ld1w { z19.s }, p3/Z, [x17]\n"
- "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x12, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z9.s\n"
+ "fmla z15.s, p3/M, z3.s, z9.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z1.s, z9.s\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "fmla z21.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x10]\n"
+ "fmla z19.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "fmla z22.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27]\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z15.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "fmla z16.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z20.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x10, x16, LSL #2]\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x11, LSL #2]\n"
+ "fmla z19.s, p3/M, z3.s, z9.s\n"
+ "fmla z14.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x26, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z14.s, p3/M, z2.s, z12.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z3.s, z12.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x9, x12, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z16.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z14.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z22.s, p3/M, z7.s, z9.s\n"
"fmla z18.s, p3/M, z5.s, z9.s\n"
- "fmla z23.s, p3/M, z2.s, z9.s\n"
- "fmla z14.s, p3/M, z8.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "fmla z13.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z11.s\n"
- "fmla z30.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z15.s, p3/M, z2.s, z9.s\n"
+ "fmla z17.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x14, LSL #2]\n"
+ "addvl x7, x7, #1\n"
"fmla z21.s, p3/M, z7.s, z10.s\n"
- "fmla z26.s, p3/M, z6.s, z10.s\n"
- "fmla z22.s, p3/M, z5.s, z10.s\n"
- "fmla z20.s, p3/M, z4.s, z10.s\n"
- "fmla z28.s, p3/M, z3.s, z10.s\n"
- "fmla z25.s, p3/M, z2.s, z10.s\n"
- "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z14.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x9]\n"
+ "fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z15.s, p3/M, z5.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z10.s\n"
"fmla z24.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z27.s, p3/M, z0.s, z9.s\n"
- "fmla z18.s, p3/M, z6.s, z11.s\n"
- "fmla z23.s, p3/M, z3.s, z11.s\n"
- "fmla z14.s, p3/M, z1.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "fmla z30.s, p3/M, z5.s, z12.s\n"
- "fmla z26.s, p3/M, z2.s, z12.s\n"
- "fmla z13.s, p3/M, z4.s, z10.s\n"
- "ld1w { z9.s }, p2/Z, [x13, x14, LSL #2]\n"
- "fmla z17.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z0.s, z10.s\n"
- "fmla z28.s, p3/M, z8.s, z11.s\n"
- "fmla z24.s, p3/M, z5.s, z11.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x5, LSL #2]\n"
- "fmla z27.s, p3/M, z2.s, z10.s\n"
- "fmla z14.s, p3/M, z2.s, z9.s\n"
- "fmla z31.s, p3/M, z5.s, z10.s\n"
- "fmla z13.s, p3/M, z5.s, z9.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x5, LSL #2]\n"
- "fmla z17.s, p3/M, z4.s, z9.s\n"
- "fmla z30.s, p3/M, z3.s, z9.s\n"
- "fmla z21.s, p3/M, z1.s, z9.s\n"
- "fmla z26.s, p3/M, z0.s, z9.s\n"
- "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z12.s\n"
- "fmla z25.s, p3/M, z6.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z27.s, p3/M, z4.s, z11.s\n"
- "fmla z14.s, p3/M, z3.s, z11.s\n"
- "fmla z18.s, p3/M, z1.s, z11.s\n"
- "fmla z22.s, p3/M, z0.s, z11.s\n"
- "fmla z31.s, p3/M, z7.s, z11.s\n"
- "fmla z13.s, p3/M, z6.s, z11.s\n"
- "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
- "fmla z29.s, p3/M, z8.s, z12.s\n"
- "fmla z24.s, p3/M, z7.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x5, LSL #2]\n"
- "fmla z17.s, p3/M, z8.s, z10.s\n"
- "fmla z30.s, p3/M, z7.s, z10.s\n"
- "fmla z21.s, p3/M, z5.s, z10.s\n"
- "fmla z26.s, p3/M, z4.s, z10.s\n"
- "fmla z20.s, p3/M, z2.s, z10.s\n"
- "fmla z28.s, p3/M, z1.s, z10.s\n"
- "ld1w { z11.s }, p2/Z, [x8, x14, LSL #2]\n"
- "addvl x8, x8, #1\n"
- "fmla z27.s, p3/M, z7.s, z12.s\n"
- "fmla z14.s, p3/M, z6.s, z12.s\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmla z22.s, p3/M, z3.s, z12.s\n"
- "fmla z23.s, p3/M, z1.s, z12.s\n"
- "fmla z25.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z9.s\n"
- "fmla z13.s, p3/M, z1.s, z9.s\n"
- "fmla z17.s, p3/M, z0.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x12]\n"
- "fmla z29.s, p3/M, z2.s, z12.s\n"
- "fmla z30.s, p3/M, z0.s, z11.s\n"
- "fmla z27.s, p3/M, z3.s, z9.s\n"
- "fmla z18.s, p3/M, z0.s, z9.s\n"
- "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, x11, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z19.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28]\n"
+ "fmla z16.s, p3/M, z4.s, z12.s\n"
+ "fmla z23.s, p3/M, z3.s, z12.s\n"
"fmla z26.s, p3/M, z7.s, z12.s\n"
- "fmla z20.s, p3/M, z5.s, z12.s\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
- "fmla z13.s, p3/M, z2.s, z11.s\n"
- "fmla z17.s, p3/M, z1.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
- "addvl x12, x12, #1\n"
- "fmla z31.s, p3/M, z6.s, z9.s\n"
- "ld1w { z12.s }, p2/Z, [x11]\n"
- "fmla z25.s, p3/M, z4.s, z10.s\n"
- "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
- "fmla z29.s, p3/M, z3.s, z10.s\n"
- "fmla z30.s, p3/M, z8.s, z11.s\n"
- "fmla z26.s, p3/M, z5.s, z11.s\n"
- "fmla z28.s, p3/M, z2.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
- "addvl x11, x11, #1\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "fmla z18.s, p3/M, z3.s, z12.s\n"
- "fmla z23.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
- "fmla z24.s, p3/M, z2.s, z11.s\n"
- "fmla z25.s, p3/M, z7.s, z12.s\n"
- "fmla z29.s, p3/M, z6.s, z12.s\n"
- "fmla z18.s, p3/M, z8.s, z10.s\n"
- "fmla z22.s, p3/M, z7.s, z10.s\n"
- "fmla z20.s, p3/M, z6.s, z10.s\n"
- "fmla z23.s, p3/M, z5.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z5.s, z11.s\n"
- "fmla z25.s, p3/M, z5.s, z10.s\n"
- "fmla z29.s, p3/M, z4.s, z10.s\n"
- "fmla z24.s, p3/M, z3.s, z10.s\n"
- "fmla z26.s, p3/M, z8.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x13, x5, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z10.s\n"
- "fmla z20.s, p3/M, z7.s, z10.s\n"
- "addvl x24, x24, #1\n"
- "fmla z28.s, p3/M, z6.s, z10.s\n"
- "fmla z25.s, p3/M, z8.s, z11.s\n"
- "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
- "addvl x13, x13, #1\n"
- "fmla z29.s, p3/M, z7.s, z11.s\n"
- "fmla z24.s, p3/M, z6.s, z11.s\n"
- "ld1w { z11.s }, p2/Z, [x28, x5, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z12.s\n"
- "fmla z13.s, p3/M, z3.s, z12.s\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
- "fmax z13.s, p3/M, z13.s, z15.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "fmla z14.s, p3/M, z0.s, z12.s\n"
- "ld1w { z0.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
- "fmla z17.s, p3/M, z5.s, z10.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "fmax z17.s, p3/M, z17.s, z15.s\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "fmla z21.s, p3/M, z2.s, z10.s\n"
- "fmla z26.s, p3/M, z1.s, z10.s\n"
- "fmax z14.s, p3/M, z14.s, z15.s\n"
- "fmax z21.s, p3/M, z21.s, z15.s\n"
- "fmla z18.s, p3/M, z7.s, z11.s\n"
- "fmla z22.s, p3/M, z6.s, z11.s\n"
- "fmax z26.s, p3/M, z26.s, z15.s\n"
- "fmax z18.s, p3/M, z18.s, z15.s\n"
- "fmla z23.s, p3/M, z4.s, z11.s\n"
- "fmla z25.s, p3/M, z3.s, z11.s\n"
- "fmax z22.s, p3/M, z22.s, z15.s\n"
- "fmax z23.s, p3/M, z23.s, z15.s\n"
- "fmla z20.s, p3/M, z8.s, z0.s\n"
- "fmla z28.s, p3/M, z7.s, z0.s\n"
- "fmax z20.s, p3/M, z20.s, z15.s\n"
- "fmax z28.s, p3/M, z28.s, z15.s\n"
- "fmla z29.s, p3/M, z5.s, z0.s\n"
- "fmla z24.s, p3/M, z4.s, z0.s\n"
- "fmax z25.s, p3/M, z25.s, z15.s\n"
- "fmax z29.s, p3/M, z29.s, z15.s\n"
- "fmax z24.s, p3/M, z24.s, z15.s\n"
- "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
- "cmp x16, %x[n_channels]\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
- "fmin z13.s, p3/M, z13.s, z16.s\n"
- "fmin z17.s, p3/M, z17.s, z16.s\n"
- "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
- "addvl x17, x17, #16\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "ld1w { z10.s }, p1/Z, [x8]\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmin z14.s, p3/M, z14.s, z16.s\n"
- "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
- "fmin z21.s, p3/M, z21.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "st1w { z31.s }, p0, [x15]\n"
- "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
- "fmin z18.s, p3/M, z18.s, z16.s\n"
- "fmin z22.s, p3/M, z22.s, z16.s\n"
- "st1w { z13.s }, p0, [x15, x6, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
- "fmin z20.s, p3/M, z20.s, z16.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z17.s }, p0, [x15, x25, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "st1w { z30.s }, p0, [x15, x22, LSL #2]\n"
- "fmin z29.s, p3/M, z29.s, z16.s\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "st1w { z27.s }, p0, [x9]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p1/Z, [x9, x16, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x11, LSL #2]\n"
"addvl x28, x28, #1\n"
- "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
- "addvl x15, x15, #1\n"
- "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
- "addvl x17, x17, #-6\n"
- "st1w { z26.s }, p0, [x9, x22, LSL #2]\n"
- "addvl x9, x9, #1\n"
- "st1w { z18.s }, p0, [x26]\n"
- "st1w { z22.s }, p0, [x26, x6, LSL #2]\n"
- "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
- "st1w { z28.s }, p0, [x26, x22, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
+ "fmla z23.s, p3/M, z6.s, z10.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x14, LSL #2]\n"
"addvl x26, x26, #1\n"
- "st1w { z23.s }, p0, [x23]\n"
- "st1w { z25.s }, p0, [x23, x6, LSL #2]\n"
- "st1w { z29.s }, p0, [x23, x25, LSL #2]\n"
- "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "fmla z16.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z15.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x10, x12, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x10, x4, LSL #2]\n"
+ "addvl x10, x10, #1\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "fmla z20.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z10.s\n"
+ "fmla z30.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z14.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x12, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "fmla z18.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z26.s, p3/M, z6.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z27.s\n"
+ "fmax z22.s, p3/M, z22.s, z27.s\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
+ "fmla z16.s, p3/M, z3.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z27.s\n"
+ "fmax z30.s, p3/M, z30.s, z27.s\n"
+ "fmla z15.s, p3/M, z8.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z10.s\n"
+ "fmax z21.s, p3/M, z21.s, z27.s\n"
+ "fmax z14.s, p3/M, z14.s, z27.s\n"
+ "fmla z23.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmax z18.s, p3/M, z18.s, z27.s\n"
+ "fmax z31.s, p3/M, z31.s, z27.s\n"
+ "fmax z24.s, p3/M, z24.s, z27.s\n"
+ "fmax z26.s, p3/M, z26.s, z27.s\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "fmax z25.s, p3/M, z25.s, z27.s\n"
+ "fmax z16.s, p3/M, z16.s, z27.s\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "fmax z15.s, p3/M, z15.s, z27.s\n"
+ "fmax z17.s, p3/M, z17.s, z27.s\n"
+ "ld1w { z10.s }, p1/Z, [x7]\n"
+ "ld1w { z11.s }, p1/Z, [x7, x11, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z27.s\n"
+ "fmax z20.s, p3/M, z20.s, z27.s\n"
+ "ld1w { z12.s }, p1/Z, [x9, x14, LSL #2]\n"
+ "addvl x8, x8, #16\n"
+ "whilelt p2.s, x6, %x[n_channels]\n"
+ "cmp x3, %x[n_channels]\n"
+ "fmin z19.s, p3/M, z19.s, z29.s\n"
+ "fmin z30.s, p3/M, z30.s, z29.s\n"
+ "fmin z28.s, p3/M, z28.s, z29.s\n"
+ "fmin z22.s, p3/M, z22.s, z29.s\n"
+ "fmin z21.s, p3/M, z21.s, z29.s\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "fmin z14.s, p3/M, z14.s, z29.s\n"
+ "fmin z18.s, p3/M, z18.s, z29.s\n"
+ "st1w { z19.s }, p0, [x17]\n"
+ "fmin z31.s, p3/M, z31.s, z29.s\n"
+ "fmin z24.s, p3/M, z24.s, z29.s\n"
+ "st1w { z30.s }, p0, [x17, x5, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z29.s\n"
+ "fmin z15.s, p3/M, z15.s, z29.s\n"
+ "st1w { z28.s }, p0, [x17, x15, LSL #2]\n"
+ "fmin z17.s, p3/M, z17.s, z29.s\n"
+ "fmin z25.s, p3/M, z25.s, z29.s\n"
+ "st1w { z22.s }, p0, [x17, x13, LSL #2]\n"
+ "fmin z16.s, p3/M, z16.s, z29.s\n"
+ "fmin z23.s, p3/M, z23.s, z29.s\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "fmin z20.s, p3/M, z20.s, z29.s\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z14.s }, p0, [x25, x5, LSL #2]\n"
+ "st1w { z18.s }, p0, [x25, x15, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "addvl x8, x8, #-6\n"
+ "st1w { z31.s }, p0, [x25, x13, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z24.s }, p0, [x24]\n"
+ "st1w { z26.s }, p0, [x24, x5, LSL #2]\n"
+ "st1w { z15.s }, p0, [x24, x15, LSL #2]\n"
+ "st1w { z17.s }, p0, [x24, x13, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z25.s }, p0, [x23]\n"
+ "st1w { z16.s }, p0, [x23, x5, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x15, LSL #2]\n"
+ "st1w { z20.s }, p0, [x23, x13, LSL #2]\n"
"addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
- "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
- "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "movprfx z30, z19\n fmla z30.s, p3/M, z3.s, z9.s\n"
- "movprfx z13, z19\n fmla z13.s, p3/M, z1.s, z9.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
- "add x4, x4, #0x1\n"
- "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
- "fmla z14.s, p3/M, z5.s, z12.s\n"
- "cmp x4, x20\n"
- "add x21, x16, #0x1\n"
- "movprfx z18, z19\n fmla z18.s, p3/M, z7.s, z9.s\n"
- "movprfx z28, z19\n fmla z28.s, p3/M, z6.s, z9.s\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "csel x16, x16, x21, LT\n"
- "movprfx z17, z19\n fmla z17.s, p3/M, z5.s, z9.s\n"
- "movprfx z26, z19\n fmla z26.s, p3/M, z2.s, z9.s\n"
- "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "movprfx z14, z13\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z7.s, z9.s\n"
"mov p0.b, p2.b\n"
- "fmla z31.s, p3/M, z0.s, z10.s\n"
- "movprfx z27, z19\n fmla z27.s, p3/M, z2.s, z11.s\n"
- "ld1w { z29.s }, p2/Z, [x24]\n"
- "ld1w { z21.s }, p2/Z, [x24, x27, LSL #2]\n"
- "fmla z30.s, p3/M, z4.s, z12.s\n"
- "fmla z13.s, p3/M, z2.s, z12.s\n"
- "csel x4, x4, XZR, LT\n"
- "cmp x16, x20\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z6.s, z9.s\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "add x2, x2, #0x1\n"
+ "add x20, x1, #0x1\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z15.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "cmp x2, x22\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z13\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x26]\n"
+ "ld1w { z24.s }, p2/Z, [x26, x11, LSL #2]\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "csel x1, x1, x20, LT\n"
+ "csel x2, x2, XZR, LT\n"
"fmla z20.s, p3/M, z1.s, z12.s\n"
- "movprfx z10, z19\n fmla z10.s, p3/M, z6.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x11, x14, LSL #2]\n"
- "fmla z14.s, p3/M, z7.s, z9.s\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmla z28.s, p3/M, z7.s, z12.s\n"
- "fmla z27.s, p3/M, z6.s, z12.s\n"
- "movprfx z11, z19\n fmla z11.s, p3/M, z3.s, z12.s\n"
- "movprfx z25, z19\n fmla z25.s, p3/M, z0.s, z12.s\n"
- "ld1w { z22.s }, p2/Z, [x8, x5, LSL #2]\n"
- "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z21.s\n"
- "fmla z30.s, p3/M, z6.s, z9.s\n"
- "ld1w { z21.s }, p2/Z, [x8, x10, LSL #2]\n"
- "fmla z13.s, p3/M, z4.s, z9.s\n"
- "fmla z20.s, p3/M, z3.s, z9.s\n"
- "movprfx z12, z19\n fmla z12.s, p3/M, z1.s, z9.s\n"
- "movprfx z23, z19\n fmla z23.s, p3/M, z0.s, z9.s\n"
- "fmla z17.s, p3/M, z8.s, z9.s\n"
- "fmla z26.s, p3/M, z5.s, z9.s\n"
- "fmla z10.s, p3/M, z2.s, z9.s\n"
- "fmla z14.s, p3/M, z8.s, z29.s\n"
- "ld1w { z9.s }, p2/Z, [x13]\n"
- "fmla z31.s, p3/M, z1.s, z22.s\n"
- "fmla z18.s, p3/M, z0.s, z22.s\n"
- "ld1w { z22.s }, p2/Z, [x13, x27, LSL #2]\n"
- "fmla z28.s, p3/M, z2.s, z21.s\n"
- "fmla z27.s, p3/M, z1.s, z21.s\n"
- "ld1w { z19.s }, p2/Z, [x28]\n"
- "fmla z30.s, p3/M, z7.s, z29.s\n"
- "fmla z11.s, p3/M, z6.s, z29.s\n"
- "fmla z13.s, p3/M, z5.s, z29.s\n"
- "fmla z20.s, p3/M, z4.s, z29.s\n"
- "fmla z25.s, p3/M, z3.s, z29.s\n"
- "fmla z12.s, p3/M, z2.s, z29.s\n"
- "fmla z23.s, p3/M, z1.s, z29.s\n"
- "fmla z24.s, p3/M, z0.s, z29.s\n"
- "ld1w { z21.s }, p2/Z, [x13, x7, LSL #2]\n"
- "fmla z17.s, p3/M, z0.s, z9.s\n"
- "fmla z26.s, p3/M, z6.s, z19.s\n"
- "fmla z10.s, p3/M, z3.s, z19.s\n"
- "fmla z14.s, p3/M, z1.s, z21.s\n"
- "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z9.s\n"
- "fmla z27.s, p3/M, z5.s, z22.s\n"
- "fmla z11.s, p3/M, z2.s, z22.s\n"
- "fmla z18.s, p3/M, z4.s, z21.s\n"
- "ld1w { z29.s }, p2/Z, [x13, x14, LSL #2]\n"
- "fmla z28.s, p3/M, z3.s, z21.s\n"
- "fmla z30.s, p3/M, z0.s, z21.s\n"
- "fmla z25.s, p3/M, z8.s, z19.s\n"
- "fmla z24.s, p3/M, z5.s, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x24, x5, LSL #2]\n"
- "fmla z17.s, p3/M, z2.s, z21.s\n"
- "fmla z14.s, p3/M, z2.s, z29.s\n"
- "fmla z31.s, p3/M, z5.s, z21.s\n"
- "fmla z18.s, p3/M, z5.s, z29.s\n"
- "ld1w { z22.s }, p2/Z, [x12, x5, LSL #2]\n"
- "fmla z28.s, p3/M, z4.s, z29.s\n"
- "fmla z27.s, p3/M, z3.s, z29.s\n"
- "fmla z30.s, p3/M, z1.s, z29.s\n"
- "fmla z11.s, p3/M, z0.s, z29.s\n"
- "ld1w { z21.s }, p2/Z, [x12, x10, LSL #2]\n"
- "fmla z10.s, p3/M, z7.s, z19.s\n"
- "fmla z12.s, p3/M, z6.s, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x24, x10, LSL #2]\n"
- "fmla z17.s, p3/M, z4.s, z22.s\n"
- "fmla z14.s, p3/M, z3.s, z22.s\n"
- "fmla z26.s, p3/M, z1.s, z22.s\n"
- "fmla z13.s, p3/M, z0.s, z22.s\n"
- "fmla z31.s, p3/M, z7.s, z22.s\n"
- "fmla z18.s, p3/M, z6.s, z22.s\n"
- "ld1w { z29.s }, p2/Z, [x8, x7, LSL #2]\n"
- "fmla z23.s, p3/M, z8.s, z19.s\n"
- "fmla z24.s, p3/M, z7.s, z19.s\n"
- "ld1w { z19.s }, p2/Z, [x11, x5, LSL #2]\n"
- "fmla z28.s, p3/M, z8.s, z21.s\n"
- "fmla z27.s, p3/M, z7.s, z21.s\n"
- "fmla z30.s, p3/M, z5.s, z21.s\n"
- "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z14.s, p3/M, z7.s, z15.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z19.s, p3/M, z7.s, z12.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "cmp x1, x21\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z11, z13\n fmla z11.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z12, z13\n fmla z12.s, p3/M, z8.s, z24.s\n"
+ "fmla z23.s, p3/M, z6.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x12, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z15.s\n"
+ "fmla z20.s, p3/M, z3.s, z15.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z1.s, z15.s\n"
+ "fmla z13.s, p3/M, z0.s, z15.s\n"
+ "fmla z26.s, p3/M, z8.s, z15.s\n"
+ "fmla z28.s, p3/M, z5.s, z15.s\n"
+ "fmla z22.s, p3/M, z2.s, z15.s\n"
+ "fmla z14.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z15.s }, p2/Z, [x10]\n"
+ "fmla z18.s, p3/M, z1.s, z16.s\n"
+ "fmla z25.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z21.s }, p2/Z, [x10, x11, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z17.s\n"
+ "fmla z9.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x27]\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z11.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "fmla z12.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z17.s }, p2/Z, [x10, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z0.s, z15.s\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z22.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z3.s, z15.s\n"
+ "fmla z14.s, p3/M, z1.s, z17.s\n"
+ "fmla z9.s, p3/M, z5.s, z21.s\n"
+ "fmla z31.s, p3/M, z2.s, z21.s\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x10, x14, LSL #2]\n"
+ "fmla z19.s, p3/M, z3.s, z17.s\n"
+ "fmla z23.s, p3/M, z0.s, z17.s\n"
+ "fmla z11.s, p3/M, z8.s, z16.s\n"
+ "fmla z12.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z17.s\n"
+ "fmla z14.s, p3/M, z2.s, z21.s\n"
+ "fmla z18.s, p3/M, z5.s, z17.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x4, LSL #2]\n"
+ "fmla z19.s, p3/M, z4.s, z21.s\n"
+ "fmla z9.s, p3/M, z3.s, z21.s\n"
+ "fmla z23.s, p3/M, z1.s, z21.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "fmla z24.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "fmla z14.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "fmla z18.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z8.s, z16.s\n"
+ "fmla z12.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x4, LSL #2]\n"
+ "fmla z19.s, p3/M, z8.s, z21.s\n"
+ "fmla z9.s, p3/M, z7.s, z21.s\n"
+ "fmla z23.s, p3/M, z5.s, z21.s\n"
+ "fmla z31.s, p3/M, z4.s, z21.s\n"
"fmla z20.s, p3/M, z2.s, z21.s\n"
- "fmla z25.s, p3/M, z1.s, z21.s\n"
- "ld1w { z22.s }, p2/Z, [x8, x14, LSL #2]\n"
- "fmla z17.s, p3/M, z7.s, z19.s\n"
- "fmla z14.s, p3/M, z6.s, z19.s\n"
- "fmla z26.s, p3/M, z4.s, z19.s\n"
- "fmla z13.s, p3/M, z3.s, z19.s\n"
- "fmla z10.s, p3/M, z1.s, z19.s\n"
- "fmla z12.s, p3/M, z0.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x11, x10, LSL #2]\n"
- "fmla z31.s, p3/M, z2.s, z29.s\n"
- "fmla z18.s, p3/M, z1.s, z29.s\n"
- "fmla z28.s, p3/M, z0.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x12]\n"
- "fmla z23.s, p3/M, z2.s, z21.s\n"
- "fmla z27.s, p3/M, z0.s, z22.s\n"
- "fmla z17.s, p3/M, z3.s, z29.s\n"
- "fmla z26.s, p3/M, z0.s, z29.s\n"
- "fmla z30.s, p3/M, z8.s, z21.s\n"
- "fmla z11.s, p3/M, z7.s, z21.s\n"
- "fmla z20.s, p3/M, z5.s, z21.s\n"
- "fmla z25.s, p3/M, z4.s, z21.s\n"
- "fmla z24.s, p3/M, z1.s, z21.s\n"
- "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
- "fmla z18.s, p3/M, z2.s, z22.s\n"
- "fmla z28.s, p3/M, z1.s, z22.s\n"
- "ld1w { z21.s }, p2/Z, [x12, x27, LSL #2]\n"
- "fmla z31.s, p3/M, z6.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x11]\n"
- "fmla z12.s, p3/M, z4.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z19.s\n"
- "fmla z27.s, p3/M, z8.s, z21.s\n"
- "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x7, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z14.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z24.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z1.s, z17.s\n"
+ "fmla z19.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "fmla z9.s, p3/M, z0.s, z21.s\n"
+ "fmla z13.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z20.s, p3/M, z5.s, z16.s\n"
+ "fmla z26.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z11.s, p3/M, z4.s, z16.s\n"
+ "fmla z12.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x27, x16, LSL #2]\n"
"fmla z25.s, p3/M, z2.s, z21.s\n"
- "ld1w { z9.s }, p2/Z, [x11, x27, LSL #2]\n"
- "fmla z17.s, p3/M, z6.s, z29.s\n"
- "fmla z26.s, p3/M, z3.s, z29.s\n"
- "fmla z10.s, p3/M, z0.s, z29.s\n"
- "ld1w { z22.s }, p2/Z, [x24, x7, LSL #2]\n"
- "fmla z24.s, p3/M, z2.s, z9.s\n"
- "fmla z12.s, p3/M, z7.s, z22.s\n"
- "fmla z23.s, p3/M, z6.s, z22.s\n"
- "fmla z26.s, p3/M, z8.s, z19.s\n"
- "fmla z13.s, p3/M, z7.s, z19.s\n"
- "fmla z20.s, p3/M, z6.s, z19.s\n"
- "fmla z10.s, p3/M, z5.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x28, x14, LSL #2]\n"
- "fmla z25.s, p3/M, z5.s, z9.s\n"
- "fmla z12.s, p3/M, z5.s, z21.s\n"
- "fmla z23.s, p3/M, z4.s, z21.s\n"
- "fmla z24.s, p3/M, z3.s, z21.s\n"
- "fmla z11.s, p3/M, z8.s, z9.s\n"
- "ld1w { z19.s }, p2/Z, [x24, x14, LSL #2]\n"
- "fmla z10.s, p3/M, z8.s, z22.s\n"
- "ld1w { z22.s }, p2/Z, [x13, x5, LSL #2]\n"
- "fmla z13.s, p3/M, z8.s, z21.s\n"
- "fmla z20.s, p3/M, z7.s, z21.s\n"
- "fmla z25.s, p3/M, z6.s, z21.s\n"
- "fmla z12.s, p3/M, z8.s, z19.s\n"
- "ld1w { z29.s }, p2/Z, [x13, x10, LSL #2]\n"
- "fmla z23.s, p3/M, z7.s, z19.s\n"
- "fmla z24.s, p3/M, z6.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x28, x5, LSL #2]\n"
- "fmla z31.s, p3/M, z4.s, z22.s\n"
- "fmla z18.s, p3/M, z3.s, z22.s\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
- "fmax z18.s, p3/M, z18.s, z15.s\n"
- "fmla z17.s, p3/M, z1.s, z22.s\n"
- "fmla z14.s, p3/M, z0.s, z22.s\n"
- "ld1w { z9.s }, p2/Z, [x28, x10, LSL #2]\n"
- "fmax z17.s, p3/M, z17.s, z15.s\n"
- "fmla z28.s, p3/M, z5.s, z29.s\n"
- "fmla z27.s, p3/M, z4.s, z29.s\n"
- "fmax z28.s, p3/M, z28.s, z15.s\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
- "fmla z30.s, p3/M, z2.s, z29.s\n"
- "fmla z11.s, p3/M, z1.s, z29.s\n"
- "fmax z14.s, p3/M, z14.s, z15.s\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "fmla z26.s, p3/M, z7.s, z21.s\n"
+ "fmla z19.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z13.s, p3/M, z3.s, z15.s\n"
+ "fmla z30.s, p3/M, z7.s, z15.s\n"
+ "fmla z9.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "fmla z11.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z15.s\n"
+ "fmla z12.s, p3/M, z2.s, z16.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "fmla z24.s, p3/M, z7.s, z21.s\n"
"fmla z13.s, p3/M, z6.s, z21.s\n"
- "fmax z11.s, p3/M, z11.s, z15.s\n"
- "fmax z26.s, p3/M, z26.s, z15.s\n"
- "fmla z10.s, p3/M, z4.s, z21.s\n"
- "fmla z12.s, p3/M, z3.s, z21.s\n"
- "fmax z13.s, p3/M, z13.s, z15.s\n"
- "fmax z10.s, p3/M, z10.s, z15.s\n"
- "fmla z20.s, p3/M, z8.s, z9.s\n"
- "fmla z25.s, p3/M, z7.s, z9.s\n"
- "fmax z20.s, p3/M, z20.s, z15.s\n"
- "fmax z25.s, p3/M, z25.s, z15.s\n"
- "fmla z23.s, p3/M, z5.s, z9.s\n"
- "fmla z24.s, p3/M, z4.s, z9.s\n"
- "fmax z12.s, p3/M, z12.s, z15.s\n"
- "fmax z23.s, p3/M, z23.s, z15.s\n"
- "fmax z24.s, p3/M, z24.s, z15.s\n"
- "fmin z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z31.s }, p0, [x15]\n"
- "fmin z18.s, p3/M, z18.s, z16.s\n"
- "fmin z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z18.s }, p0, [x15, x6, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z16.s\n"
- "fmin z17.s, p3/M, z17.s, z16.s\n"
- "st1w { z28.s }, p0, [x15, x25, LSL #2]\n"
- "fmin z14.s, p3/M, z14.s, z16.s\n"
- "fmin z30.s, p3/M, z30.s, z16.s\n"
- "st1w { z27.s }, p0, [x15, x22, LSL #2]\n"
- "fmin z11.s, p3/M, z11.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z16.s\n"
- "st1w { z17.s }, p0, [x9]\n"
- "fmin z13.s, p3/M, z13.s, z16.s\n"
- "fmin z20.s, p3/M, z20.s, z16.s\n"
- "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
- "fmin z25.s, p3/M, z25.s, z16.s\n"
- "fmin z10.s, p3/M, z10.s, z16.s\n"
- "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
- "fmin z12.s, p3/M, z12.s, z16.s\n"
- "fmin z23.s, p3/M, z23.s, z16.s\n"
- "st1w { z11.s }, p0, [x9, x22, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z16.s\n"
- "st1w { z26.s }, p0, [x26]\n"
- "st1w { z13.s }, p0, [x26, x6, LSL #2]\n"
- "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
- "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
- "st1w { z10.s }, p0, [x23]\n"
- "st1w { z12.s }, p0, [x23, x6, LSL #2]\n"
- "st1w { z23.s }, p0, [x23, x25, LSL #2]\n"
- "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "fmla z11.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z15.s\n"
+ "fmla z22.s, p3/M, z5.s, z15.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z16.s\n"
+ "fmla z13.s, p3/M, z4.s, z16.s\n"
+ "fmla z12.s, p3/M, z3.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmla z11.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z15.s }, p2/Z, [x10, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z13.s, p3/M, z7.s, z17.s\n"
+ "fmla z12.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x4, LSL #2]\n"
+ "fmla z19.s, p3/M, z5.s, z15.s\n"
+ "fmla z9.s, p3/M, z4.s, z15.s\n"
+ "fmla z18.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z3.s, z16.s\n"
+ "fmla z26.s, p3/M, z1.s, z16.s\n"
+ "fmla z14.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x12, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z15.s\n"
+ "fmla z31.s, p3/M, z1.s, z15.s\n"
+ "fmla z28.s, p3/M, z7.s, z17.s\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmax z19.s, p3/M, z19.s, z27.s\n"
+ "fmax z9.s, p3/M, z9.s, z27.s\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z24.s, p3/M, z3.s, z17.s\n"
+ "fmax z18.s, p3/M, z18.s, z27.s\n"
+ "fmax z25.s, p3/M, z25.s, z27.s\n"
+ "fmla z20.s, p3/M, z8.s, z16.s\n"
+ "fmla z11.s, p3/M, z7.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z27.s\n"
+ "fmax z14.s, p3/M, z14.s, z27.s\n"
+ "fmla z13.s, p3/M, z5.s, z16.s\n"
+ "fmla z12.s, p3/M, z4.s, z16.s\n"
+ "fmax z23.s, p3/M, z23.s, z27.s\n"
+ "fmax z31.s, p3/M, z31.s, z27.s\n"
+ "fmax z28.s, p3/M, z28.s, z27.s\n"
+ "fmax z30.s, p3/M, z30.s, z27.s\n"
+ "fmax z22.s, p3/M, z22.s, z27.s\n"
+ "fmax z24.s, p3/M, z24.s, z27.s\n"
+ "fmax z20.s, p3/M, z20.s, z27.s\n"
+ "fmax z11.s, p3/M, z11.s, z27.s\n"
+ "fmax z13.s, p3/M, z13.s, z27.s\n"
+ "fmax z12.s, p3/M, z12.s, z27.s\n"
+ "fmin z18.s, p3/M, z18.s, z29.s\n"
+ "fmin z25.s, p3/M, z25.s, z29.s\n"
+ "fmin z19.s, p3/M, z19.s, z29.s\n"
+ "fmin z9.s, p3/M, z9.s, z29.s\n"
+ "fmin z26.s, p3/M, z26.s, z29.s\n"
+ "fmin z14.s, p3/M, z14.s, z29.s\n"
+ "fmin z23.s, p3/M, z23.s, z29.s\n"
+ "fmin z31.s, p3/M, z31.s, z29.s\n"
+ "st1w { z18.s }, p0, [x17]\n"
+ "fmin z28.s, p3/M, z28.s, z29.s\n"
+ "fmin z30.s, p3/M, z30.s, z29.s\n"
+ "st1w { z25.s }, p0, [x17, x5, LSL #2]\n"
+ "fmin z20.s, p3/M, z20.s, z29.s\n"
+ "fmin z11.s, p3/M, z11.s, z29.s\n"
+ "st1w { z19.s }, p0, [x17, x15, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z29.s\n"
+ "fmin z24.s, p3/M, z24.s, z29.s\n"
+ "st1w { z9.s }, p0, [x17, x13, LSL #2]\n"
+ "fmin z13.s, p3/M, z13.s, z29.s\n"
+ "fmin z12.s, p3/M, z12.s, z29.s\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "st1w { z14.s }, p0, [x25, x5, LSL #2]\n"
+ "st1w { z23.s }, p0, [x25, x15, LSL #2]\n"
+ "st1w { z31.s }, p0, [x25, x13, LSL #2]\n"
+ "st1w { z28.s }, p0, [x24]\n"
+ "st1w { z30.s }, p0, [x24, x5, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x15, LSL #2]\n"
+ "st1w { z11.s }, p0, [x24, x13, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z24.s }, p0, [x23, x5, LSL #2]\n"
+ "st1w { z13.s }, p0, [x23, x15, LSL #2]\n"
+ "st1w { z12.s }, p0, [x23, x13, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
index 3db248924f..587f18d90d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -101,607 +101,607 @@ void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
"ptrue p3.b\n"
"ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
"add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ld1w { z17.s }, p3/Z, [x7]\n"
- "cntw x17\n"
- "mov x16, #0x0\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "cntw x16\n"
+ "mov x15, #0x0\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z22.s }, p3/Z, [x7]\n"
"ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
"ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
- "whilelt p2.s, XZR, %x[n_channels]\n"
"ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
"ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
- "cmp x17, %x[n_channels]\n"
"ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "sub x14, XZR, x16\n"
"ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
- "sub x15, XZR, x17\n"
"ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
"addvl x7, x7, #16\n"
- "ldp x23, x22, [x8, #0x0]\n"
- "ldp x21, x20, [x8, #0x10]\n"
- "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
"ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
"addvl x7, x7, #-6\n"
- "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z20, z17\n fmla z20.s, p3/M, z4.s, z9.s\n"
- "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z22\n fmla z18.s, p3/M, z8.s, z9.s\n"
"ldr x27, [x8, #0x20]\n"
"ldr x24, [x8, #0x30]\n"
- "movprfx z24, z17\n fmla z24.s, p3/M, z3.s, z9.s\n"
- "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z17, z22\n fmla z17.s, p3/M, z3.s, z9.s\n"
+ "movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
"ldr x23, [x8, #0x28]\n"
"ldr x22, [x8, #0x38]\n"
- "movprfx z31, z17\n fmla z31.s, p3/M, z0.s, z9.s\n"
- "movprfx z22, z17\n fmla z22.s, p3/M, z7.s, z9.s\n"
+ "movprfx z21, z22\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z7.s, z9.s\n"
"ldr x26, [x8, #0x40]\n"
- "ldr x21, [x8, #0x48]\n"
- "movprfx z27, z17\n fmla z27.s, p3/M, z6.s, z9.s\n"
- "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "ldr x20, [x8, #0x48]\n"
+ "movprfx z25, z22\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z5.s, z9.s\n"
"ldr x25, [x8, #0x50]\n"
- "ldr x20, [x8, #0x58]\n"
- "movprfx z14, z17\n fmla z14.s, p3/M, z5.s, z9.s\n"
- "movprfx z23, z17\n fmla z23.s, p3/M, z2.s, z9.s\n"
- "ld1w { z25.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x58]\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "movprfx z23, z22\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
"ldr x13, [x8, #0x70]\n"
- "fmla z26.s, p3/M, z0.s, z10.s\n"
- "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
- "ld1w { z28.s }, p2/Z, [x27, x16, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
- "fmla z24.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "movprfx z10, z22\n fmla z10.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
"ldr x24, [x8, #0x60]\n"
"ldr x23, [x8, #0x68]\n"
- "fmla z31.s, p3/M, z1.s, z12.s\n"
- "fmla z22.s, p3/M, z8.s, z12.s\n"
- "incw x15\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z12.s\n"
+ "incw x14\n"
"mov p1.b, p2.b\n"
- "fmla z27.s, p3/M, z7.s, z12.s\n"
- "movprfx z15, z17\n fmla z15.s, p3/M, z6.s, z28.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z12.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
"ldr x28, [x8, #0x88]\n"
- "fmla z20.s, p3/M, z7.s, z25.s\n"
- "fmla z9.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x14, #0x0]\n"
- "ldr x11, [x14, #0x8]\n"
- "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
- "movprfx z13, z17\n fmla z13.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "fmla z10.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x0]\n"
+ "ldr x11, [x17, #0x8]\n"
+ "movprfx z15, z22\n fmla z15.s, p3/M, z3.s, z12.s\n"
+ "movprfx z20, z22\n fmla z20.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
"ldr x22, [x8, #0x78]\n"
- "movprfx z28, z17\n fmla z28.s, p3/M, z8.s, z21.s\n"
- "fmla z24.s, p3/M, z6.s, z25.s\n"
- "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
- "ldr x21, [x8, #0x80]\n"
- "fmla z30.s, p3/M, z4.s, z25.s\n"
- "fmla z31.s, p3/M, z3.s, z25.s\n"
- "ldr x10, [x14, #0x10]\n"
- "ldr x9, [x14, #0x18]\n"
- "movprfx z18, z17\n fmla z18.s, p3/M, z1.s, z25.s\n"
- "movprfx z21, z17\n fmla z21.s, p3/M, z0.s, z25.s\n"
- "whilelt p0.s, x17, %x[n_channels]\n"
- "ld1w { z17.s }, p3/Z, [x7]\n"
- "fmla z14.s, p3/M, z8.s, z25.s\n"
- "fmla z23.s, p3/M, z5.s, z25.s\n"
- "fmla z15.s, p3/M, z2.s, z25.s\n"
- "fmla z26.s, p3/M, z1.s, z12.s\n"
- "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "movprfx z24, z22\n fmla z24.s, p3/M, z8.s, z27.s\n"
+ "fmla z17.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x20, [x8, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ldr x10, [x17, #0x10]\n"
+ "ldr x9, [x17, #0x18]\n"
+ "movprfx z13, z22\n fmla z13.s, p3/M, z1.s, z16.s\n"
+ "movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z16.s\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "ld1w { z22.s }, p3/Z, [x7]\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z16.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
"ldr x27, [x8, #0x90]\n"
- "fmla z22.s, p3/M, z0.s, z12.s\n"
- "fmla z27.s, p3/M, z2.s, z29.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
- "ldr x20, [x8, #0x98]\n"
- "fmla z20.s, p3/M, z8.s, z10.s\n"
- "fmla z9.s, p3/M, z1.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x8, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z9.s\n"
+ "fmla z10.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
"ldr x26, [x8, #0xa0]\n"
- "fmla z24.s, p3/M, z7.s, z10.s\n"
- "fmla z11.s, p3/M, z6.s, z10.s\n"
- "fmla z30.s, p3/M, z5.s, z10.s\n"
- "fmla z31.s, p3/M, z4.s, z10.s\n"
- "fmla z13.s, p3/M, z3.s, z10.s\n"
- "fmla z18.s, p3/M, z2.s, z10.s\n"
- "fmla z21.s, p3/M, z1.s, z10.s\n"
- "fmla z28.s, p3/M, z0.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z9.s\n"
+ "fmla z15.s, p3/M, z6.s, z9.s\n"
+ "fmla z26.s, p3/M, z5.s, z9.s\n"
+ "fmla z21.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z13.s, p3/M, z2.s, z9.s\n"
+ "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
"ldr x25, [x8, #0xa8]\n"
- "fmla z26.s, p3/M, z3.s, z25.s\n"
- "fmla z14.s, p3/M, z0.s, z25.s\n"
- "fmla z23.s, p3/M, z6.s, z29.s\n"
- "fmla z15.s, p3/M, z3.s, z29.s\n"
- "ld1w { z25.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z3.s, z16.s\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x15, LSL #2]\n"
"ldr x24, [x8, #0xb0]\n"
- "fmla z22.s, p3/M, z4.s, z10.s\n"
- "fmla z27.s, p3/M, z3.s, z10.s\n"
- "fmla z20.s, p3/M, z1.s, z10.s\n"
- "fmla z9.s, p3/M, z5.s, z12.s\n"
- "fmla z11.s, p3/M, z2.s, z12.s\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z29.s, p3/M, z1.s, z9.s\n"
+ "fmla z10.s, p3/M, z5.s, z11.s\n"
+ "fmla z15.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
"ldr x23, [x8, #0xb8]\n"
- "fmla z13.s, p3/M, z8.s, z25.s\n"
- "fmla z28.s, p3/M, z5.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
"ldr x22, [x8, #0xc0]\n"
- "fmla z26.s, p3/M, z5.s, z10.s\n"
- "fmla z14.s, p3/M, z2.s, z10.s\n"
- "ld1w { z29.s }, p2/Z, [x28, x16, LSL #2]\n"
- "ldr x21, [x8, #0xc8]\n"
- "fmla z22.s, p3/M, z5.s, z12.s\n"
- "fmla z27.s, p3/M, z4.s, z12.s\n"
- "fmla z20.s, p3/M, z2.s, z12.s\n"
- "fmla z9.s, p3/M, z3.s, z12.s\n"
- "fmla z24.s, p3/M, z1.s, z12.s\n"
- "fmla z11.s, p3/M, z0.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "ldr x20, [x8, #0xc8]\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z25.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "fmla z10.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "fmla z15.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
"ldr x28, [x8, #0xd8]\n"
- "fmla z15.s, p3/M, z7.s, z25.s\n"
- "fmla z18.s, p3/M, z6.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x27, x16, LSL #2]\n"
- "ldr x20, [x8, #0xd0]\n"
- "fmla z26.s, p3/M, z7.s, z29.s\n"
- "fmla z22.s, p3/M, z6.s, z29.s\n"
- "fmla z14.s, p3/M, z4.s, z29.s\n"
- "fmla z20.s, p3/M, z3.s, z29.s\n"
- "fmla z23.s, p3/M, z1.s, z29.s\n"
- "fmla z30.s, p3/M, z0.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z13.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x21, [x8, #0xd0]\n"
+ "fmla z18.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "fmla z23.s, p3/M, z1.s, z16.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x15, LSL #2]\n"
"ldr x27, [x8, #0xe0]\n"
- "fmla z27.s, p3/M, z8.s, z10.s\n"
- "fmla z21.s, p3/M, z8.s, z25.s\n"
- "fmla z28.s, p3/M, z7.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
- "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z8.s, z9.s\n"
+ "fmla z27.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z1.s, z9.s\n"
"ldr x26, [x8, #0xe8]\n"
- "fmla z9.s, p3/M, z7.s, z10.s\n"
- "fmla z24.s, p3/M, z5.s, z10.s\n"
- "fmla z11.s, p3/M, z4.s, z10.s\n"
- "fmla z31.s, p3/M, z2.s, z10.s\n"
- "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z5.s, z9.s\n"
+ "fmla z15.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x15, LSL #2]\n"
"ldr x25, [x8, #0xf0]\n"
- "fmla z26.s, p3/M, z2.s, z29.s\n"
- "fmla z22.s, p3/M, z1.s, z29.s\n"
- "fmla z27.s, p3/M, z0.s, z29.s\n"
- "fmla z14.s, p3/M, z7.s, z25.s\n"
- "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z16.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z25.s, p3/M, z0.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
"ldr x24, [x8, #0xf8]\n"
- "fmla z20.s, p3/M, z6.s, z25.s\n"
- "fmla z23.s, p3/M, z4.s, z25.s\n"
- "fmla z30.s, p3/M, z3.s, z25.s\n"
- "fmla z15.s, p3/M, z1.s, z25.s\n"
- "fmla z18.s, p3/M, z0.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
- "fmla z13.s, p3/M, z4.s, z25.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z26.s, p3/M, z3.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
"ldr x23, [x8, #0x100]\n"
- "fmla z21.s, p3/M, z2.s, z25.s\n"
- "fmla z22.s, p3/M, z2.s, z10.s\n"
- "fmla z27.s, p3/M, z1.s, z10.s\n"
- "fmla z9.s, p3/M, z0.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
- "ldr x22, [x8, #0x108]\n"
- "fmla z26.s, p3/M, z6.s, z29.s\n"
- "fmla z14.s, p3/M, z3.s, z29.s\n"
- "fmla z23.s, p3/M, z0.s, z29.s\n"
- "fmla z24.s, p3/M, z8.s, z25.s\n"
- "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
- "ldr x21, [x8, #0x110]\n"
- "fmla z11.s, p3/M, z7.s, z25.s\n"
- "fmla z31.s, p3/M, z5.s, z25.s\n"
- "fmla z28.s, p3/M, z1.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x28, x16, LSL #2]\n"
- "fmla z13.s, p3/M, z2.s, z12.s\n"
- "ldr x20, [x8, #0x118]\n"
- "fmla z15.s, p3/M, z0.s, z10.s\n"
- "fmla z18.s, p3/M, z4.s, z25.s\n"
- "fmla z21.s, p3/M, z3.s, z25.s\n"
- "fmla z9.s, p3/M, z8.s, z12.s\n"
- "fmla z11.s, p3/M, z5.s, z12.s\n"
- "fmla z14.s, p3/M, z6.s, z10.s\n"
- "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
- "fmla z23.s, p3/M, z3.s, z10.s\n"
- "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z25.s\n"
- "fmla z31.s, p3/M, z6.s, z25.s\n"
- "fmla z15.s, p3/M, z5.s, z25.s\n"
- "fmla z13.s, p3/M, z5.s, z12.s\n"
- "fmla z28.s, p3/M, z2.s, z12.s\n"
- "fmla z18.s, p3/M, z7.s, z29.s\n"
- "fmla z21.s, p3/M, z6.s, z29.s\n"
- "fmla z23.s, p3/M, z8.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
- "fmla z15.s, p3/M, z8.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z25.s\n"
- "fmla z31.s, p3/M, z7.s, z25.s\n"
- "fmla z13.s, p3/M, z6.s, z25.s\n"
- "fmla z18.s, p3/M, z5.s, z25.s\n"
- "fmla z21.s, p3/M, z4.s, z25.s\n"
- "fmla z28.s, p3/M, z3.s, z25.s\n"
- "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
- "ldp x27, x26, [x8, #0x0]\n"
- "fmla z11.s, p3/M, z8.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
- "fmla z26.s, p3/M, z4.s, z29.s\n"
- "fmax z26.s, p3/M, z26.s, z16.s\n"
- "fmla z22.s, p3/M, z3.s, z29.s\n"
- "fmla z27.s, p3/M, z5.s, z25.s\n"
- "fmax z22.s, p3/M, z22.s, z16.s\n"
- "fmax z27.s, p3/M, z27.s, z16.s\n"
- "fmla z9.s, p3/M, z4.s, z25.s\n"
- "fmla z18.s, p3/M, z8.s, z12.s\n"
- "fmax z9.s, p3/M, z9.s, z16.s\n"
- "fmin z26.s, p3/M, z26.s, z19.s\n"
- "fmla z21.s, p3/M, z7.s, z12.s\n"
- "fmla z28.s, p3/M, z6.s, z12.s\n"
- "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
- "fmin z22.s, p3/M, z22.s, z19.s\n"
- "fmla z14.s, p3/M, z1.s, z29.s\n"
- "fmla z20.s, p3/M, z0.s, z29.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
- "fmin z27.s, p3/M, z27.s, z19.s\n"
- "fmla z24.s, p3/M, z2.s, z25.s\n"
- "fmla z11.s, p3/M, z1.s, z25.s\n"
- "fmin z9.s, p3/M, z9.s, z19.s\n"
- "fmax z14.s, p3/M, z14.s, z16.s\n"
- "fmla z23.s, p3/M, z7.s, z10.s\n"
- "fmla z30.s, p3/M, z6.s, z10.s\n"
- "fmax z20.s, p3/M, z20.s, z16.s\n"
- "fmax z24.s, p3/M, z24.s, z16.s\n"
- "fmla z31.s, p3/M, z8.s, z12.s\n"
- "fmla z13.s, p3/M, z7.s, z12.s\n"
- "fmax z11.s, p3/M, z11.s, z16.s\n"
- "st1w { z26.s }, p1, [x12, x15, LSL #2]\n"
- "st1w { z22.s }, p1, [x11, x15, LSL #2]\n"
- "ldr x23, [x14, #0x20]\n"
- "ldr x22, [x14, #0x28]\n"
- "fmla z15.s, p3/M, z4.s, z10.s\n"
- "st1w { z27.s }, p1, [x10, x15, LSL #2]\n"
- "ldr x21, [x14, #0x30]\n"
- "fmla z18.s, p3/M, z3.s, z10.s\n"
- "fmla z21.s, p3/M, z5.s, z12.s\n"
- "st1w { z9.s }, p1, [x9, x15, LSL #2]\n"
- "ldr x20, [x14, #0x38]\n"
- "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "fmla z10.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x8, #0x108]\n"
+ "fmla z18.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z17.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x8, #0x110]\n"
+ "fmla z15.s, p3/M, z7.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z10.s, p3/M, z8.s, z11.s\n"
+ "ldr x21, [x8, #0x118]\n"
+ "fmla z20.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "fmla z30.s, p3/M, z6.s, z12.s\n"
+ "fmla z23.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z16.s\n"
+ "fmla z27.s, p3/M, z3.s, z16.s\n"
+ "fmla z15.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z24.s, p3/M, z2.s, z12.s\n"
+ "fmla z13.s, p3/M, z7.s, z9.s\n"
+ "fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z15.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z13.s, p3/M, z5.s, z16.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z9.s\n"
+ "ldp x20, x26, [x8, #0x0]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z10.s, p3/M, z4.s, z11.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmax z18.s, p3/M, z18.s, z19.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z15.s, p3/M, z1.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z19.s\n"
+ "fmax z30.s, p3/M, z30.s, z19.s\n"
+ "fmax z25.s, p3/M, z25.s, z19.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z19.s\n"
+ "fmax z10.s, p3/M, z10.s, z19.s\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z26.s, p3/M, z6.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z14.s\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
"ldp x25, x24, [x8, #0x10]\n"
- "fmin z14.s, p3/M, z14.s, z19.s\n"
- "fmin z20.s, p3/M, z20.s, z19.s\n"
- "st1w { z14.s }, p1, [x23, x15, LSL #2]\n"
- "ldr x23, [x14, #0x40]\n"
- "fmin z24.s, p3/M, z24.s, z19.s\n"
- "fmin z11.s, p3/M, z11.s, z19.s\n"
- "st1w { z20.s }, p1, [x22, x15, LSL #2]\n"
- "ldr x22, [x14, #0x48]\n"
- "fmax z23.s, p3/M, z23.s, z16.s\n"
- "fmax z30.s, p3/M, z30.s, z16.s\n"
- "st1w { z24.s }, p1, [x21, x15, LSL #2]\n"
- "ldr x21, [x14, #0x50]\n"
- "fmax z31.s, p3/M, z31.s, z16.s\n"
- "fmax z13.s, p3/M, z13.s, z16.s\n"
- "st1w { z11.s }, p1, [x20, x15, LSL #2]\n"
- "ldr x20, [x14, #0x58]\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "fmax z17.s, p3/M, z17.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "fmla z24.s, p3/M, z4.s, z16.s\n"
+ "fmin z10.s, p3/M, z10.s, z14.s\n"
+ "fmax z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z18.s }, p1, [x12, x14, LSL #2]\n"
+ "ldr x23, [x17, #0x20]\n"
+ "st1w { z28.s }, p1, [x11, x14, LSL #2]\n"
+ "ldr x22, [x17, #0x28]\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "st1w { z25.s }, p1, [x10, x14, LSL #2]\n"
+ "ldr x21, [x17, #0x30]\n"
+ "fmin z17.s, p3/M, z17.s, z14.s\n"
+ "fmax z23.s, p3/M, z23.s, z19.s\n"
+ "st1w { z10.s }, p1, [x9, x14, LSL #2]\n"
+ "ldr x20, [x17, #0x38]\n"
+ "fmin z15.s, p3/M, z15.s, z14.s\n"
+ "fmax z26.s, p3/M, z26.s, z19.s\n"
+ "fmax z21.s, p3/M, z21.s, z19.s\n"
+ "fmax z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z30.s }, p1, [x23, x14, LSL #2]\n"
+ "ldr x23, [x17, #0x40]\n"
+ "st1w { z29.s }, p1, [x22, x14, LSL #2]\n"
+ "ldr x22, [x17, #0x48]\n"
+ "incw x15\n"
+ "ld1w { z10.s }, p0/Z, [x26, x16, LSL #2]\n"
+ "st1w { z17.s }, p1, [x21, x14, LSL #2]\n"
+ "ldr x21, [x17, #0x50]\n"
+ "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "st1w { z15.s }, p1, [x20, x14, LSL #2]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
"incw x16\n"
- "ld1w { z9.s }, p0/Z, [x27, x17, LSL #2]\n"
- "ld1w { z10.s }, p0/Z, [x26, x17, LSL #2]\n"
- "fmin z23.s, p3/M, z23.s, z19.s\n"
- "ld1w { z11.s }, p0/Z, [x25, x17, LSL #2]\n"
- "ld1w { z12.s }, p0/Z, [x24, x17, LSL #2]\n"
- "incw x17\n"
- "fmin z30.s, p3/M, z30.s, z19.s\n"
- "fmin z31.s, p3/M, z31.s, z19.s\n"
- "fmin z13.s, p3/M, z13.s, z19.s\n"
- "st1w { z23.s }, p1, [x23, x15, LSL #2]\n"
- "ldr x23, [x14, #0x60]\n"
- "fmax z15.s, p3/M, z15.s, z16.s\n"
- "fmax z18.s, p3/M, z18.s, z16.s\n"
- "st1w { z30.s }, p1, [x22, x15, LSL #2]\n"
- "ldr x22, [x14, #0x68]\n"
- "fmax z21.s, p3/M, z21.s, z16.s\n"
- "fmax z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z31.s }, p1, [x21, x15, LSL #2]\n"
- "ldr x21, [x14, #0x70]\n"
- "st1w { z13.s }, p1, [x20, x15, LSL #2]\n"
- "ldr x20, [x14, #0x78]\n"
- "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
- "whilelt p2.s, x16, %x[n_channels]\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
"ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
"ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
- "cmp x17, %x[n_channels]\n"
- "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ "fmax z31.s, p3/M, z31.s, z19.s\n"
+ "st1w { z23.s }, p1, [x23, x14, LSL #2]\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmax z13.s, p3/M, z13.s, z19.s\n"
+ "fmax z27.s, p3/M, z27.s, z19.s\n"
"ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
"ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
- "fmin z18.s, p3/M, z18.s, z19.s\n"
- "fmin z21.s, p3/M, z21.s, z19.s\n"
+ "fmax z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z26.s }, p1, [x22, x14, LSL #2]\n"
+ "ldr x22, [x17, #0x68]\n"
"ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
+ "ldr x21, [x17, #0x70]\n"
"ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
"addvl x7, x7, #16\n"
- "fmin z28.s, p3/M, z28.s, z19.s\n"
- "st1w { z15.s }, p1, [x23, x15, LSL #2]\n"
+ "st1w { z20.s }, p1, [x20, x14, LSL #2]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "fmin z13.s, p3/M, z13.s, z14.s\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
"ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
"ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
"addvl x7, x7, #-6\n"
- "st1w { z18.s }, p1, [x22, x15, LSL #2]\n"
- "st1w { z21.s }, p1, [x21, x15, LSL #2]\n"
- "st1w { z28.s }, p1, [x20, x15, LSL #2]\n"
+ "st1w { z31.s }, p1, [x23, x14, LSL #2]\n"
+ "st1w { z13.s }, p1, [x22, x14, LSL #2]\n"
+ "st1w { z27.s }, p1, [x21, x14, LSL #2]\n"
+ "st1w { z24.s }, p1, [x20, x14, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z14, z17\n fmla z14.s, p3/M, z4.s, z9.s\n"
- "movprfx z18, z17\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "movprfx z16, z22\n fmla z16.s, p3/M, z4.s, z9.s\n"
+ "movprfx z30, z22\n fmla z30.s, p3/M, z8.s, z9.s\n"
"ldr x27, [x8, #0x20]\n"
"ldr x24, [x8, #0x30]\n"
- "movprfx z15, z17\n fmla z15.s, p3/M, z3.s, z9.s\n"
- "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z13, z22\n fmla z13.s, p3/M, z3.s, z9.s\n"
+ "movprfx z15, z22\n fmla z15.s, p3/M, z1.s, z9.s\n"
"ldr x23, [x8, #0x28]\n"
"ldr x22, [x8, #0x38]\n"
- "movprfx z20, z17\n fmla z20.s, p3/M, z0.s, z9.s\n"
- "movprfx z13, z17\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "movprfx z20, z22\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z18, z22\n fmla z18.s, p3/M, z7.s, z9.s\n"
"ldr x26, [x8, #0x40]\n"
"ldr x21, [x8, #0x48]\n"
- "movprfx z22, z17\n fmla z22.s, p3/M, z6.s, z9.s\n"
- "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "movprfx z26, z22\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "movprfx z31, z22\n fmla z31.s, p3/M, z5.s, z9.s\n"
"ldr x25, [x8, #0x50]\n"
"ldr x20, [x8, #0x58]\n"
- "movprfx z27, z17\n fmla z27.s, p3/M, z5.s, z9.s\n"
- "movprfx z31, z17\n fmla z31.s, p3/M, z2.s, z9.s\n"
- "ld1w { z23.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z16.s, p3/M, z5.s, z12.s\n"
+ "movprfx z28, z22\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z27.s }, p2/Z, [x24, x15, LSL #2]\n"
"ldr x13, [x8, #0x70]\n"
- "fmla z18.s, p3/M, z0.s, z10.s\n"
- "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
- "ld1w { z21.s }, p2/Z, [x27, x16, LSL #2]\n"
- "ld1w { z25.s }, p2/Z, [x23, x16, LSL #2]\n"
- "fmla z15.s, p3/M, z4.s, z12.s\n"
- "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "movprfx z29, z22\n fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z24.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z12.s\n"
+ "fmla z15.s, p3/M, z2.s, z12.s\n"
"ldr x24, [x8, #0x60]\n"
"ldr x23, [x8, #0x68]\n"
"fmla z20.s, p3/M, z1.s, z12.s\n"
- "fmla z13.s, p3/M, z8.s, z12.s\n"
- "incw x15\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "incw x14\n"
"mov p0.b, p2.b\n"
- "fmla z22.s, p3/M, z7.s, z12.s\n"
- "movprfx z28, z17\n fmla z28.s, p3/M, z6.s, z21.s\n"
- "ld1w { z29.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "movprfx z9, z22\n fmla z9.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
"ldr x28, [x8, #0x88]\n"
- "fmla z14.s, p3/M, z7.s, z23.s\n"
- "fmla z9.s, p3/M, z6.s, z12.s\n"
- "ldr x12, [x14, #0x0]\n"
- "ldr x11, [x14, #0x8]\n"
- "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
- "movprfx z10, z17\n fmla z10.s, p3/M, z0.s, z12.s\n"
- "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z27.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x0]\n"
+ "ldr x11, [x17, #0x8]\n"
+ "movprfx z11, z22\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z23, z22\n fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x15, LSL #2]\n"
"ldr x22, [x8, #0x78]\n"
- "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z25.s\n"
- "fmla z15.s, p3/M, z6.s, z23.s\n"
- "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "movprfx z25, z22\n fmla z25.s, p3/M, z8.s, z24.s\n"
+ "fmla z13.s, p3/M, z6.s, z27.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
"ldr x21, [x8, #0x80]\n"
- "fmla z30.s, p3/M, z4.s, z23.s\n"
- "fmla z20.s, p3/M, z3.s, z23.s\n"
- "ldr x10, [x14, #0x10]\n"
- "ldr x9, [x14, #0x18]\n"
- "movprfx z25, z17\n fmla z25.s, p3/M, z1.s, z23.s\n"
- "movprfx z24, z17\n fmla z24.s, p3/M, z0.s, z23.s\n"
- "fmla z27.s, p3/M, z8.s, z23.s\n"
- "fmla z31.s, p3/M, z5.s, z23.s\n"
- "fmla z28.s, p3/M, z2.s, z23.s\n"
- "fmla z18.s, p3/M, z1.s, z12.s\n"
- "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z4.s, z27.s\n"
+ "fmla z20.s, p3/M, z3.s, z27.s\n"
+ "ldr x10, [x17, #0x10]\n"
+ "ldr x9, [x17, #0x18]\n"
+ "movprfx z24, z22\n fmla z24.s, p3/M, z1.s, z27.s\n"
+ "movprfx z12, z22\n fmla z12.s, p3/M, z0.s, z27.s\n"
+ "fmla z31.s, p3/M, z8.s, z27.s\n"
+ "fmla z28.s, p3/M, z5.s, z27.s\n"
+ "fmla z9.s, p3/M, z2.s, z27.s\n"
+ "fmla z30.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z27.s }, p2/Z, [x25, x15, LSL #2]\n"
"ldr x27, [x8, #0x90]\n"
- "fmla z13.s, p3/M, z0.s, z12.s\n"
- "fmla z22.s, p3/M, z2.s, z21.s\n"
- "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z0.s, z21.s\n"
+ "fmla z26.s, p3/M, z2.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x20, x15, LSL #2]\n"
"ldr x20, [x8, #0x98]\n"
- "fmla z14.s, p3/M, z8.s, z29.s\n"
- "fmla z9.s, p3/M, z1.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
"ldr x26, [x8, #0xa0]\n"
- "fmla z15.s, p3/M, z7.s, z29.s\n"
- "fmla z11.s, p3/M, z6.s, z29.s\n"
- "fmla z30.s, p3/M, z5.s, z29.s\n"
- "fmla z20.s, p3/M, z4.s, z29.s\n"
- "fmla z10.s, p3/M, z3.s, z29.s\n"
- "fmla z25.s, p3/M, z2.s, z29.s\n"
- "fmla z24.s, p3/M, z1.s, z29.s\n"
- "fmla z26.s, p3/M, z0.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z7.s, z10.s\n"
+ "fmla z11.s, p3/M, z6.s, z10.s\n"
+ "fmla z15.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z12.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z22.s }, p2/Z, [x23, x15, LSL #2]\n"
"ldr x25, [x8, #0xa8]\n"
- "fmla z18.s, p3/M, z3.s, z23.s\n"
- "fmla z27.s, p3/M, z0.s, z23.s\n"
- "fmla z31.s, p3/M, z6.s, z21.s\n"
- "fmla z28.s, p3/M, z3.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z27.s\n"
+ "fmla z31.s, p3/M, z0.s, z27.s\n"
+ "fmla z28.s, p3/M, z6.s, z17.s\n"
+ "fmla z9.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x13, x15, LSL #2]\n"
"ldr x24, [x8, #0xb0]\n"
- "fmla z13.s, p3/M, z4.s, z29.s\n"
- "fmla z22.s, p3/M, z3.s, z29.s\n"
- "fmla z14.s, p3/M, z1.s, z29.s\n"
- "fmla z9.s, p3/M, z5.s, z12.s\n"
- "fmla z11.s, p3/M, z2.s, z12.s\n"
- "fmla z15.s, p3/M, z0.s, z29.s\n"
- "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z22.s\n"
+ "fmla z26.s, p3/M, z3.s, z22.s\n"
+ "fmla z16.s, p3/M, z1.s, z22.s\n"
+ "fmla z29.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z2.s, z21.s\n"
+ "fmla z13.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x15, LSL #2]\n"
"ldr x23, [x8, #0xb8]\n"
- "fmla z10.s, p3/M, z8.s, z21.s\n"
- "fmla z26.s, p3/M, z5.s, z21.s\n"
- "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z17.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x15, LSL #2]\n"
"ldr x22, [x8, #0xc0]\n"
- "fmla z18.s, p3/M, z5.s, z29.s\n"
- "fmla z27.s, p3/M, z2.s, z29.s\n"
- "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z5.s, z22.s\n"
+ "fmla z31.s, p3/M, z2.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x28, x15, LSL #2]\n"
"ldr x21, [x8, #0xc8]\n"
- "fmla z13.s, p3/M, z5.s, z17.s\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "fmla z14.s, p3/M, z2.s, z17.s\n"
- "fmla z9.s, p3/M, z3.s, z17.s\n"
- "fmla z15.s, p3/M, z1.s, z17.s\n"
- "fmla z11.s, p3/M, z0.s, z17.s\n"
- "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z5.s, z21.s\n"
+ "fmla z26.s, p3/M, z4.s, z21.s\n"
+ "fmla z16.s, p3/M, z2.s, z21.s\n"
+ "fmla z29.s, p3/M, z3.s, z21.s\n"
+ "fmla z13.s, p3/M, z1.s, z21.s\n"
+ "fmla z11.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x20, x15, LSL #2]\n"
"ldr x28, [x8, #0xd8]\n"
- "fmla z28.s, p3/M, z7.s, z23.s\n"
- "fmla z25.s, p3/M, z6.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z9.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x15, LSL #2]\n"
"ldr x20, [x8, #0xd0]\n"
- "fmla z18.s, p3/M, z7.s, z21.s\n"
- "fmla z13.s, p3/M, z6.s, z21.s\n"
- "fmla z27.s, p3/M, z4.s, z21.s\n"
- "fmla z14.s, p3/M, z3.s, z21.s\n"
- "fmla z31.s, p3/M, z1.s, z21.s\n"
- "fmla z30.s, p3/M, z0.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z22.s\n"
+ "fmla z31.s, p3/M, z4.s, z22.s\n"
+ "fmla z16.s, p3/M, z3.s, z22.s\n"
+ "fmla z28.s, p3/M, z1.s, z22.s\n"
+ "fmla z15.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x26, x15, LSL #2]\n"
"ldr x27, [x8, #0xe0]\n"
- "fmla z22.s, p3/M, z8.s, z29.s\n"
- "fmla z24.s, p3/M, z8.s, z23.s\n"
- "fmla z26.s, p3/M, z7.s, z23.s\n"
- "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
- "fmla z10.s, p3/M, z1.s, z29.s\n"
+ "fmla z26.s, p3/M, z8.s, z21.s\n"
+ "fmla z12.s, p3/M, z8.s, z17.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z1.s, z21.s\n"
"ldr x26, [x8, #0xe8]\n"
- "fmla z9.s, p3/M, z7.s, z29.s\n"
- "fmla z15.s, p3/M, z5.s, z29.s\n"
- "fmla z11.s, p3/M, z4.s, z29.s\n"
- "fmla z20.s, p3/M, z2.s, z29.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z20.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x15, LSL #2]\n"
"ldr x25, [x8, #0xf0]\n"
- "fmla z18.s, p3/M, z2.s, z21.s\n"
- "fmla z13.s, p3/M, z1.s, z21.s\n"
- "fmla z22.s, p3/M, z0.s, z21.s\n"
- "fmla z27.s, p3/M, z7.s, z23.s\n"
- "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z22.s\n"
+ "fmla z18.s, p3/M, z1.s, z22.s\n"
+ "fmla z26.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z22.s }, p2/Z, [x23, x15, LSL #2]\n"
"ldr x24, [x8, #0xf8]\n"
- "fmla z14.s, p3/M, z6.s, z23.s\n"
- "fmla z31.s, p3/M, z4.s, z23.s\n"
- "fmla z30.s, p3/M, z3.s, z23.s\n"
- "fmla z28.s, p3/M, z1.s, z23.s\n"
- "fmla z25.s, p3/M, z0.s, z23.s\n"
- "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
- "fmla z10.s, p3/M, z4.s, z17.s\n"
+ "fmla z16.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z17.s\n"
+ "fmla z15.s, p3/M, z3.s, z17.s\n"
+ "fmla z9.s, p3/M, z1.s, z17.s\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z21.s\n"
"ldr x23, [x8, #0x100]\n"
- "fmla z24.s, p3/M, z2.s, z17.s\n"
- "fmla z13.s, p3/M, z2.s, z29.s\n"
- "fmla z22.s, p3/M, z1.s, z29.s\n"
- "fmla z9.s, p3/M, z0.s, z29.s\n"
- "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z21.s\n"
+ "fmla z29.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
"ldr x22, [x8, #0x108]\n"
- "fmla z18.s, p3/M, z6.s, z21.s\n"
- "fmla z27.s, p3/M, z3.s, z21.s\n"
- "fmla z31.s, p3/M, z0.s, z21.s\n"
- "fmla z15.s, p3/M, z8.s, z17.s\n"
- "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z22.s\n"
+ "fmla z31.s, p3/M, z3.s, z22.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "fmla z12.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z0.s, z22.s\n"
+ "fmla z13.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z22.s }, p2/Z, [x20, x15, LSL #2]\n"
"ldr x21, [x8, #0x110]\n"
"fmla z11.s, p3/M, z7.s, z17.s\n"
"fmla z20.s, p3/M, z5.s, z17.s\n"
- "fmla z26.s, p3/M, z1.s, z17.s\n"
- "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
- "fmla z10.s, p3/M, z2.s, z23.s\n"
+ "fmla z25.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z21.s\n"
"ldr x20, [x8, #0x118]\n"
- "fmla z28.s, p3/M, z0.s, z29.s\n"
- "fmla z25.s, p3/M, z4.s, z21.s\n"
- "fmla z24.s, p3/M, z3.s, z21.s\n"
- "fmla z9.s, p3/M, z8.s, z23.s\n"
- "fmla z11.s, p3/M, z5.s, z23.s\n"
- "fmla z27.s, p3/M, z6.s, z29.s\n"
- "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
- "fmla z31.s, p3/M, z3.s, z29.s\n"
- "ld1w { z17.s }, p2/Z, [x26, x16, LSL #2]\n"
- "fmla z30.s, p3/M, z7.s, z21.s\n"
- "fmla z20.s, p3/M, z6.s, z21.s\n"
- "fmla z28.s, p3/M, z5.s, z21.s\n"
- "fmla z10.s, p3/M, z5.s, z23.s\n"
- "fmla z26.s, p3/M, z2.s, z23.s\n"
- "fmla z25.s, p3/M, z7.s, z17.s\n"
- "fmla z24.s, p3/M, z6.s, z17.s\n"
- "fmla z31.s, p3/M, z8.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z21.s\n"
+ "fmla z9.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z6.s, z22.s\n"
+ "fmla z28.s, p3/M, z3.s, z22.s\n"
+ "ld1w { z27.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z17.s\n"
+ "fmla z12.s, p3/M, z3.s, z17.s\n"
+ "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z15.s, p3/M, z7.s, z17.s\n"
+ "fmla z20.s, p3/M, z6.s, z17.s\n"
+ "fmla z9.s, p3/M, z5.s, z17.s\n"
"fmla z28.s, p3/M, z8.s, z17.s\n"
- "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
- "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z22.s\n"
+ "fmla z25.s, p3/M, z2.s, z22.s\n"
+ "fmla z24.s, p3/M, z7.s, z27.s\n"
+ "fmla z12.s, p3/M, z6.s, z27.s\n"
+ "fmla z11.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z9.s, p3/M, z8.s, z27.s\n"
+ "fmla z15.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z27.s }, p2/Z, [x23, x15, LSL #2]\n"
"fmla z20.s, p3/M, z7.s, z21.s\n"
- "fmla z10.s, p3/M, z6.s, z21.s\n"
- "fmla z25.s, p3/M, z5.s, z21.s\n"
- "fmla z24.s, p3/M, z4.s, z21.s\n"
- "fmla z26.s, p3/M, z3.s, z21.s\n"
- "ld1w { z21.s }, p2/Z, [x22, x16, LSL #2]\n"
- "fmla z11.s, p3/M, z8.s, z23.s\n"
- "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
- "fmla z18.s, p3/M, z4.s, z12.s\n"
- "fmax z18.s, p3/M, z18.s, z16.s\n"
- "fmla z13.s, p3/M, z3.s, z12.s\n"
- "fmla z22.s, p3/M, z5.s, z21.s\n"
- "fmax z13.s, p3/M, z13.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z16.s\n"
- "fmla z9.s, p3/M, z4.s, z21.s\n"
- "fmla z25.s, p3/M, z8.s, z29.s\n"
- "fmax z9.s, p3/M, z9.s, z16.s\n"
- "fmin z18.s, p3/M, z18.s, z19.s\n"
- "fmla z24.s, p3/M, z7.s, z29.s\n"
- "fmla z26.s, p3/M, z6.s, z29.s\n"
- "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
- "fmin z13.s, p3/M, z13.s, z19.s\n"
- "fmla z27.s, p3/M, z1.s, z12.s\n"
- "fmla z14.s, p3/M, z0.s, z12.s\n"
- "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
- "fmin z22.s, p3/M, z22.s, z19.s\n"
- "fmla z15.s, p3/M, z2.s, z21.s\n"
+ "fmla z23.s, p3/M, z6.s, z21.s\n"
+ "fmla z24.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z3.s, z21.s\n"
+ "fmla z12.s, p3/M, z4.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z27.s\n"
+ "fmla z18.s, p3/M, z3.s, z27.s\n"
+ "fmla z31.s, p3/M, z1.s, z27.s\n"
+ "fmla z16.s, p3/M, z0.s, z27.s\n"
+ "ld1w { z27.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "fmla z29.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z12.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmax z30.s, p3/M, z30.s, z19.s\n"
+ "fmla z13.s, p3/M, z2.s, z21.s\n"
"fmla z11.s, p3/M, z1.s, z21.s\n"
- "fmin z9.s, p3/M, z9.s, z19.s\n"
- "fmax z27.s, p3/M, z27.s, z16.s\n"
- "fmla z31.s, p3/M, z7.s, z23.s\n"
- "fmla z30.s, p3/M, z6.s, z23.s\n"
- "fmax z14.s, p3/M, z14.s, z16.s\n"
- "fmax z15.s, p3/M, z15.s, z16.s\n"
- "fmla z20.s, p3/M, z8.s, z29.s\n"
- "fmla z10.s, p3/M, z7.s, z29.s\n"
- "fmax z11.s, p3/M, z11.s, z16.s\n"
- "st1w { z18.s }, p0, [x12, x15, LSL #2]\n"
- "st1w { z13.s }, p0, [x11, x15, LSL #2]\n"
- "ldr x23, [x14, #0x20]\n"
- "ldr x22, [x14, #0x28]\n"
- "fmla z28.s, p3/M, z4.s, z23.s\n"
- "st1w { z22.s }, p0, [x10, x15, LSL #2]\n"
- "ldr x21, [x14, #0x30]\n"
- "fmla z25.s, p3/M, z3.s, z23.s\n"
- "fmla z24.s, p3/M, z5.s, z29.s\n"
- "st1w { z9.s }, p0, [x9, x15, LSL #2]\n"
- "ldr x20, [x14, #0x38]\n"
- "fmla z26.s, p3/M, z4.s, z29.s\n"
- "fmin z27.s, p3/M, z27.s, z19.s\n"
- "fmin z14.s, p3/M, z14.s, z19.s\n"
- "fmin z15.s, p3/M, z15.s, z19.s\n"
- "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
- "ldr x23, [x14, #0x40]\n"
- "fmin z11.s, p3/M, z11.s, z19.s\n"
- "fmax z31.s, p3/M, z31.s, z16.s\n"
- "st1w { z14.s }, p0, [x22, x15, LSL #2]\n"
- "ldr x22, [x14, #0x48]\n"
- "fmax z30.s, p3/M, z30.s, z16.s\n"
- "fmax z20.s, p3/M, z20.s, z16.s\n"
- "st1w { z15.s }, p0, [x21, x15, LSL #2]\n"
- "ldr x21, [x14, #0x50]\n"
- "fmax z10.s, p3/M, z10.s, z16.s\n"
- "st1w { z11.s }, p0, [x20, x15, LSL #2]\n"
- "ldr x20, [x14, #0x58]\n"
- "fmin z31.s, p3/M, z31.s, z19.s\n"
- "fmin z30.s, p3/M, z30.s, z19.s\n"
- "fmin z20.s, p3/M, z20.s, z19.s\n"
- "st1w { z31.s }, p0, [x23, x15, LSL #2]\n"
- "ldr x23, [x14, #0x60]\n"
- "fmin z10.s, p3/M, z10.s, z19.s\n"
- "fmax z28.s, p3/M, z28.s, z16.s\n"
- "st1w { z30.s }, p0, [x22, x15, LSL #2]\n"
- "ldr x22, [x14, #0x68]\n"
- "fmax z25.s, p3/M, z25.s, z16.s\n"
- "fmax z24.s, p3/M, z24.s, z16.s\n"
- "st1w { z20.s }, p0, [x21, x15, LSL #2]\n"
- "ldr x21, [x14, #0x70]\n"
- "fmax z26.s, p3/M, z26.s, z16.s\n"
- "st1w { z10.s }, p0, [x20, x15, LSL #2]\n"
- "ldr x20, [x14, #0x78]\n"
- "fmin z28.s, p3/M, z28.s, z19.s\n"
- "fmin z25.s, p3/M, z25.s, z19.s\n"
- "fmin z24.s, p3/M, z24.s, z19.s\n"
- "st1w { z28.s }, p0, [x23, x15, LSL #2]\n"
- "fmin z26.s, p3/M, z26.s, z19.s\n"
- "st1w { z25.s }, p0, [x22, x15, LSL #2]\n"
- "st1w { z24.s }, p0, [x21, x15, LSL #2]\n"
- "st1w { z26.s }, p0, [x20, x15, LSL #2]\n"
+ "fmax z18.s, p3/M, z18.s, z19.s\n"
+ "fmax z31.s, p3/M, z31.s, z19.s\n"
+ "fmax z26.s, p3/M, z26.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z27.s\n"
+ "fmla z23.s, p3/M, z7.s, z27.s\n"
+ "fmax z16.s, p3/M, z16.s, z19.s\n"
+ "fmax z29.s, p3/M, z29.s, z19.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "fmla z15.s, p3/M, z6.s, z10.s\n"
+ "fmin z30.s, p3/M, z30.s, z14.s\n"
+ "fmin z18.s, p3/M, z18.s, z14.s\n"
+ "fmla z9.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmin z31.s, p3/M, z31.s, z14.s\n"
+ "fmin z26.s, p3/M, z26.s, z14.s\n"
+ "fmax z13.s, p3/M, z13.s, z19.s\n"
+ "fmla z12.s, p3/M, z5.s, z27.s\n"
+ "fmla z25.s, p3/M, z4.s, z27.s\n"
+ "fmin z29.s, p3/M, z29.s, z14.s\n"
+ "fmax z11.s, p3/M, z11.s, z19.s\n"
+ "st1w { z30.s }, p0, [x12, x14, LSL #2]\n"
+ "ldr x23, [x17, #0x20]\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "ldr x22, [x17, #0x28]\n"
+ "fmin z16.s, p3/M, z16.s, z14.s\n"
+ "fmax z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z26.s }, p0, [x10, x14, LSL #2]\n"
+ "ldr x21, [x17, #0x30]\n"
+ "fmin z13.s, p3/M, z13.s, z14.s\n"
+ "fmax z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z29.s }, p0, [x9, x14, LSL #2]\n"
+ "ldr x20, [x17, #0x38]\n"
+ "fmin z11.s, p3/M, z11.s, z14.s\n"
+ "fmax z20.s, p3/M, z20.s, z19.s\n"
+ "fmax z23.s, p3/M, z23.s, z19.s\n"
+ "st1w { z31.s }, p0, [x23, x14, LSL #2]\n"
+ "ldr x23, [x17, #0x40]\n"
+ "fmin z28.s, p3/M, z28.s, z14.s\n"
+ "st1w { z16.s }, p0, [x22, x14, LSL #2]\n"
+ "ldr x22, [x17, #0x48]\n"
+ "fmin z15.s, p3/M, z15.s, z14.s\n"
+ "fmax z9.s, p3/M, z9.s, z19.s\n"
+ "st1w { z13.s }, p0, [x21, x14, LSL #2]\n"
+ "ldr x21, [x17, #0x50]\n"
+ "fmin z20.s, p3/M, z20.s, z14.s\n"
+ "fmax z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z11.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "fmin z23.s, p3/M, z23.s, z14.s\n"
+ "fmax z12.s, p3/M, z12.s, z19.s\n"
+ "fmax z25.s, p3/M, z25.s, z19.s\n"
+ "st1w { z28.s }, p0, [x23, x14, LSL #2]\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmin z9.s, p3/M, z9.s, z14.s\n"
+ "st1w { z15.s }, p0, [x22, x14, LSL #2]\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmin z24.s, p3/M, z24.s, z14.s\n"
+ "st1w { z20.s }, p0, [x21, x14, LSL #2]\n"
+ "ldr x21, [x17, #0x70]\n"
+ "fmin z12.s, p3/M, z12.s, z14.s\n"
+ "st1w { z23.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x20, [x17, #0x78]\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ "st1w { z9.s }, p0, [x23, x14, LSL #2]\n"
+ "st1w { z24.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z12.s }, p0, [x21, x14, LSL #2]\n"
+ "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
: "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
index e6090fda94..d17c63f7ae 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,246 +88,246 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x11, #0x0\n"
- "mov x16, #0x0\n"
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
"1:" // Tile loop
- "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
- "mov x25, #0x4\n"
- "mov x24, #0x2\n"
- "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
- "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "cntw x13\n"
- "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
- "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x10, x15, x15\n"
- "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
- "add x12, x12, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "add x28, x12, x23, LSL #2\n"
- "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cntw x16\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
"whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z30.s }, p3/Z, [x11]\n"
- "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
- "add x27, x28, x23, LSL #2\n"
- "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
- "add x26, x10, x15\n"
- "add x25, x27, x23, LSL #2\n"
- "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
- "add x24, x26, x15\n"
- "add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "cmp x13, %x[n_channels]\n"
- "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x23, x25, x23, LSL #2\n"
- "add x22, x9, x21, LSL #2\n"
- "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
- "mov x21, #0x0\n"
- "sub x20, XZR, x13\n"
- "ld1w { z9.s }, p2/Z, [x27, x10, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x12]\n"
- "ld1w { z11.s }, p2/Z, [x12, x15, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x12, x26, LSL #2]\n"
- "addvl x11, x11, #-6\n"
- "ld1w { z13.s }, p2/Z, [x12, x24, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x28]\n"
- "ld1w { z15.s }, p2/Z, [x28, x15, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x22, x7, x24\n" // offset = tile_i * ld_input_row
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x17, x17\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x21, x7, x23\n" // offset = tile_i * ld_output_row
+ "add x9, x10, x17\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x20, XZR, x16\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ld1w { z28.s }, p3/Z, [x12]\n"
+ "ld1w { z0.s }, p3/Z, [x12, #1, MUL VL]\n"
+ "add x28, x9, x17\n"
+ "ld1w { z1.s }, p3/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x12, #3, MUL VL]\n"
+ "madd x21, x8, x15, x21\n" // offset += tile_j * ld_output_col
+ "ld1w { z3.s }, p3/Z, [x12, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x12, #5, MUL VL]\n"
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "ld1w { z5.s }, p3/Z, [x12, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #16\n"
+ "mul x21, x21, x25\n" // offset *= output_tile_size
+ "add x13, x13, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x27, x13, x24, LSL #2\n"
+ "add x26, x27, x24, LSL #2\n"
+ "ld1w { z10.s }, p2/Z, [x13]\n"
+ "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n"
+ "add x25, x26, x24, LSL #2\n"
+ "add x11, x11, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x24, x25, x24, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x12, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x12, #-7, MUL VL]\n"
+ "add x23, x11, x23, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x13, x9, LSL #2]\n"
+ "addvl x12, x12, #-6\n"
+ "ld1w { z13.s }, p2/Z, [x13, x28, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x27]\n"
+ "ld1w { z15.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x13, x10, LSL #2]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
- "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
- "whilelt p1.s, x13, %x[n_channels]\n"
- "incw x21\n"
+ "movprfx z27, z28\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z28\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "incw x14\n"
+ "movprfx z25, z28\n fmla z25.s, p3/M, z2.s, z9.s\n"
+ "movprfx z24, z28\n fmla z24.s, p3/M, z0.s, z9.s\n"
+ "incw x16\n"
+ "mov p0.b, p2.b\n"
+ "addvl x13, x13, #1\n"
+ "ld1w { z28.s }, p3/Z, [x12]\n"
+ "incw x20\n"
"fmla z27.s, p3/M, z0.s, z10.s\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
- "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
- "incw x13\n"
+ "ld1w { z21.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x13]\n"
"fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "addvl x27, x27, #1\n"
"fmla z27.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x25]\n"
"fmla z26.s, p3/M, z0.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x25]\n"
- "mov p0.b, p2.b\n"
+ "fmla z25.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x28, LSL #2]\n"
"fmla z27.s, p3/M, z4.s, z15.s\n"
- "fmla z26.s, p3/M, z4.s, z17.s\n"
- "ld1w { z25.s }, p2/Z, [x27]\n"
- "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x26]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x17, LSL #2]\n"
"fmla z27.s, p3/M, z2.s, z16.s\n"
- "fmla z26.s, p3/M, z5.s, z20.s\n"
- "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
- "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
- "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
- "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "addvl x12, x12, #1\n"
- "addvl x28, x28, #1\n"
- "fmla z27.s, p3/M, z5.s, z19.s\n"
- "fmla z26.s, p3/M, z3.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x11]\n"
- "fmla z22.s, p3/M, z3.s, z18.s\n"
- "fmla z21.s, p3/M, z4.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
- "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z0.s, z25.s\n"
- "fmla z21.s, p3/M, z1.s, z24.s\n"
- "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
- "incw x20\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "fmla z21.s, p3/M, z5.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z25.s\n"
- "fmla z22.s, p3/M, z1.s, z23.s\n"
- "ld1w { z17.s }, p2/Z, [x23]\n"
- "addvl x27, x27, #1\n"
- "fmla z21.s, p3/M, z2.s, z19.s\n"
- "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z25.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z0.s }, p3/Z, [x12, #1, MUL VL]\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z20.s\n"
+ "fmla z26.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x17, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z4.s }, p3/Z, [x12, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z17.s }, p2/Z, [x24]\n"
+ "fmla z26.s, p3/M, z7.s, z18.s\n"
+ "fmla z24.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z12.s }, p1/Z, [x13, x9, LSL #2]\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z1.s }, p3/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z9.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
"ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z29.s\n"
- "fmla z22.s, p3/M, z6.s, z17.s\n"
- "fmla z21.s, p3/M, z3.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
- "fmla z22.s, p3/M, z7.s, z20.s\n"
- "fmla z21.s, p3/M, z7.s, z18.s\n"
- "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
- "fmla z26.s, p3/M, z7.s, z24.s\n"
- "fmla z22.s, p3/M, z5.s, z16.s\n"
- "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
- "fmla z21.s, p3/M, z6.s, z17.s\n"
- "fmla z26.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z29.s\n"
- "fmla z22.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z29.s\n"
- "fmax z21.s, p3/M, z21.s, z29.s\n"
- "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
- "cmp x13, %x[n_channels]\n"
- "fmin z27.s, p3/M, z27.s, z28.s\n"
- "ld1w { z10.s }, p1/Z, [x12]\n"
- "ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
- "fmin z26.s, p3/M, z26.s, z28.s\n"
- "fmin z22.s, p3/M, z22.s, z28.s\n"
- "ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
- "fmin z21.s, p3/M, z21.s, z28.s\n"
"addvl x25, x25, #1\n"
- "ld1w { z14.s }, p1/Z, [x28]\n"
- "ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z23.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z20.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z30.s\n"
+ "fmla z24.s, p3/M, z2.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x12, #3, MUL VL]\n"
+ "whilelt p2.s, x14, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "addvl x24, x24, #1\n"
+ "fmin z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z25.s, p3/M, z7.s, z21.s\n"
+ "ld1w { z13.s }, p1/Z, [x13, x28, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z30.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z3.s }, p3/Z, [x12, #4, MUL VL]\n"
+ "fmin z26.s, p3/M, z26.s, z29.s\n"
+ "st1w { z27.s }, p0, [x11]\n"
+ "fmla z25.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z5.s }, p3/Z, [x12, #6, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x13, x10, LSL #2]\n"
+ "st1w { z26.s }, p0, [x11, x15, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z14.s }, p1/Z, [x27]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z6.s }, p3/Z, [x12, #7, MUL VL]\n"
+ "addvl x12, x12, #16\n"
+ "ld1w { z15.s }, p1/Z, [x27, x17, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z30.s\n"
+ "ld1w { z7.s }, p3/Z, [x12, #-8, MUL VL]\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z11.s }, p1/Z, [x13, x17, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x12, #-7, MUL VL]\n"
+ "addvl x12, x12, #-6\n"
+ "fmin z25.s, p3/M, z25.s, z29.s\n"
+ "fmax z24.s, p3/M, z24.s, z30.s\n"
+ "st1w { z25.s }, p0, [x23]\n"
+ "fmin z24.s, p3/M, z24.s, z29.s\n"
+ "st1w { z24.s }, p0, [x23, x15, LSL #2]\n"
"addvl x23, x23, #1\n"
- "ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
- "st1w { z27.s }, p0, [x9]\n"
- "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
- "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
- "addvl x9, x9, #1\n"
- "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
- "addvl x11, x11, #-6\n"
- "st1w { z22.s }, p0, [x22]\n"
- "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
- "addvl x22, x22, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
- "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
- "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z28\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z28\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z25, z28\n fmla z25.s, p3/M, z2.s, z9.s\n"
+ "movprfx z24, z28\n fmla z24.s, p3/M, z0.s, z9.s\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "mov p0.b, p2.b\n"
+ "add x8, x8, #0x1\n"
+ "add x20, x7, #0x1\n"
"fmla z27.s, p3/M, z0.s, z10.s\n"
"fmla z26.s, p3/M, z1.s, z12.s\n"
- "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ld1w { z21.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "cmp x8, x22\n"
+ "csel x7, x7, x20, LT\n"
+ "csel x8, x8, XZR, LT\n"
"fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
"fmla z26.s, p3/M, z2.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "cmp x7, x21\n"
"fmla z27.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x25]\n"
"fmla z26.s, p3/M, z0.s, z16.s\n"
- "ld1w { z18.s }, p2/Z, [x25]\n"
- "add x16, x16, #0x1\n"
+ "fmla z25.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x28, LSL #2]\n"
"fmla z27.s, p3/M, z4.s, z15.s\n"
- "fmla z26.s, p3/M, z4.s, z17.s\n"
- "ld1w { z25.s }, p2/Z, [x27]\n"
- "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x26]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "fmla z25.s, p3/M, z0.s, z22.s\n"
"fmla z27.s, p3/M, z2.s, z16.s\n"
- "fmla z26.s, p3/M, z5.s, z20.s\n"
- "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
- "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
- "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
- "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "cmp x16, x20\n"
- "add x21, x11, #0x1\n"
- "fmla z27.s, p3/M, z5.s, z19.s\n"
- "fmla z26.s, p3/M, z3.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
- "fmla z22.s, p3/M, z3.s, z18.s\n"
- "fmla z21.s, p3/M, z4.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
- "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z22.s, p3/M, z0.s, z25.s\n"
- "fmla z21.s, p3/M, z1.s, z24.s\n"
- "csel x11, x11, x21, LT\n"
- "mov p0.b, p2.b\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "fmla z21.s, p3/M, z5.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
- "fmla z27.s, p3/M, z6.s, z25.s\n"
- "fmla z22.s, p3/M, z1.s, z23.s\n"
- "ld1w { z17.s }, p2/Z, [x23]\n"
- "csel x16, x16, XZR, LT\n"
- "fmla z21.s, p3/M, z2.s, z19.s\n"
- "fmla z27.s, p3/M, z7.s, z23.s\n"
- "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
- "fmax z27.s, p3/M, z27.s, z29.s\n"
- "fmla z22.s, p3/M, z6.s, z17.s\n"
- "fmla z21.s, p3/M, z3.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
- "cmp x11, x20\n"
- "fmla z22.s, p3/M, z7.s, z20.s\n"
- "fmla z21.s, p3/M, z7.s, z18.s\n"
- "fmin z27.s, p3/M, z27.s, z28.s\n"
- "st1w { z27.s }, p0, [x9]\n"
- "fmla z26.s, p3/M, z7.s, z24.s\n"
- "fmla z22.s, p3/M, z5.s, z16.s\n"
- "fmla z21.s, p3/M, z6.s, z17.s\n"
- "fmla z26.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
- "fmax z26.s, p3/M, z26.s, z29.s\n"
- "fmla z22.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "fmax z22.s, p3/M, z22.s, z29.s\n"
- "fmax z21.s, p3/M, z21.s, z29.s\n"
- "fmin z26.s, p3/M, z26.s, z28.s\n"
- "fmin z22.s, p3/M, z22.s, z28.s\n"
- "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
- "fmin z21.s, p3/M, z21.s, z28.s\n"
- "st1w { z22.s }, p0, [x22]\n"
- "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z20.s\n"
+ "fmla z25.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x24, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x24]\n"
+ "fmla z25.s, p3/M, z1.s, z18.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z21.s\n"
+ "fmla z24.s, p3/M, z5.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z30.s\n"
+ "fmla z25.s, p3/M, z7.s, z20.s\n"
+ "fmax z26.s, p3/M, z26.s, z30.s\n"
+ "fmin z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z24.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmin z26.s, p3/M, z26.s, z29.s\n"
+ "st1w { z27.s }, p0, [x11]\n"
+ "fmla z24.s, p3/M, z3.s, z19.s\n"
+ "st1w { z26.s }, p0, [x11, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmax z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z25.s, p3/M, z25.s, z29.s\n"
+ "st1w { z25.s }, p0, [x23]\n"
+ "fmla z24.s, p3/M, z8.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z29.s\n"
+ "st1w { z24.s }, p0, [x23, x15, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
index 98427701fa..0a4929918b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,245 +89,245 @@ void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
- "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "cntw x14\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x15\n"
+ "mov x14, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
"ldp x13, x12, [x20, #0x0]\n"
"ldp x11, x10, [x20, #0x10]\n"
- "mov x9, #0x0\n"
- "whilelt p2.s, XZR, %x[n_channels]\n"
- "ld1w { z20.s }, p3/Z, [x16]\n"
- "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
- "sub x28, XZR, x14\n"
- "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
- "addvl x16, x16, #16\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
- "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x17]\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "cmp x15, %x[n_channels]\n"
+ "sub x9, XZR, x15\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x14, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
- "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x15, #0x50]\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z2.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z14.s\n"
- "fmla z23.s, p3/M, z0.s, z16.s\n"
- "ldr x20, [x15, #0x58]\n"
- "ldr x22, [x15, #0x78]\n"
- "fmla z24.s, p3/M, z4.s, z15.s\n"
- "fmla z23.s, p3/M, z4.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x60]\n"
- "fmla z24.s, p3/M, z2.s, z16.s\n"
- "fmla z23.s, p3/M, z5.s, z18.s\n"
- "ldr x20, [x15, #0x80]\n"
- "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
- "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
- "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x68]\n"
- "fmla z24.s, p3/M, z5.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x20, [x15, #0x88]\n"
- "fmla z22.s, p3/M, z3.s, z17.s\n"
- "fmla z21.s, p3/M, z4.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z0.s, z18.s\n"
- "fmla z21.s, p3/M, z1.s, z20.s\n"
- "ldr x21, [x15, #0x70]\n"
- "ldr x20, [x15, #0x98]\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "fmla z21.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z24.s, p3/M, z6.s, z18.s\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "fmla z22.s, p3/M, z1.s, z16.s\n"
- "fmla z21.s, p3/M, z2.s, z19.s\n"
- "fmla z24.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0xa0]\n"
- "ldr x20, [x15, #0xb0]\n"
- "fmla z22.s, p3/M, z6.s, z16.s\n"
- "fmla z21.s, p3/M, z3.s, z18.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z17.s\n"
- "fmla z21.s, p3/M, z7.s, z16.s\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla z23.s, p3/M, z7.s, z20.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z5.s, z18.s\n"
- "ldr x20, [x15, #0xc0]\n"
- "fmla z21.s, p3/M, z6.s, z17.s\n"
- "fmla z23.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "whilelt p1.s, x14, %x[n_channels]\n"
- "ldp x27, x26, [x15, #0x0]\n"
- "ldp x25, x24, [x15, #0x10]\n"
- "ldp x23, x22, [x15, #0x20]\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x28, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "movprfx z25, z30\n fmla z25.s, p3/M, z2.s, z9.s\n"
+ "movprfx z24, z30\n fmla z24.s, p3/M, z0.s, z9.s\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
"incw x9\n"
- "fmax z24.s, p3/M, z24.s, z26.s\n"
- "ldp x21, x20, [x15, #0x30]\n"
- "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
- "fmax z23.s, p3/M, z23.s, z26.s\n"
- "fmax z22.s, p3/M, z22.s, z26.s\n"
- "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
- "fmax z21.s, p3/M, z21.s, z26.s\n"
- "incw x28\n"
- "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "ldr x26, [x16, #0x70]\n"
"mov p0.b, p2.b\n"
- "whilelt p2.s, x9, %x[n_channels]\n"
- "ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
- "ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
- "fmin z24.s, p3/M, z24.s, z25.s\n"
- "fmin z23.s, p3/M, z23.s, z25.s\n"
- "ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldr x22, [x16, #0x88]\n"
+ "ld1w { z30.s }, p3/Z, [x17]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "fmla z25.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z23.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z25.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z20.s\n"
+ "fmla z26.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z24.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "fmla z24.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "fmla z26.s, p3/M, z7.s, z18.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "fmla z24.s, p3/M, z5.s, z23.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z21.s\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z24.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
"incw x14\n"
- "ld1w { z20.s }, p3/Z, [x16]\n"
- "cmp x14, %x[n_channels]\n"
- "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
- "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
- "fmin z22.s, p3/M, z22.s, z25.s\n"
- "fmin z21.s, p3/M, z21.s, z25.s\n"
- "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
- "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
- "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
- "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
- "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
- "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
- "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
- "addvl x16, x16, #16\n"
- "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
- "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
- "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
- "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
- "addvl x16, x16, #-6\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "fmla z25.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmla z24.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "whilelt p2.s, x14, %x[n_channels]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "st1w { z27.s }, p0, [x13, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z14.s }, p1/Z, [x22, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z15.s }, p1/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "st1w { z26.s }, p0, [x12, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "incw x15\n"
+ "fmax z25.s, p3/M, z25.s, z29.s\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "addvl x17, x17, #-6\n"
+ "cmp x15, %x[n_channels]\n"
+ "fmin z25.s, p3/M, z25.s, z28.s\n"
+ "fmax z24.s, p3/M, z24.s, z29.s\n"
+ "fmin z24.s, p3/M, z24.s, z28.s\n"
+ "st1w { z25.s }, p0, [x11, x9, LSL #2]\n"
+ "st1w { z24.s }, p0, [x10, x9, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
- "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
- "ldr x21, [x15, #0x40]\n"
- "ldr x20, [x15, #0x48]\n"
- "fmla z24.s, p3/M, z0.s, z10.s\n"
- "fmla z23.s, p3/M, z1.s, z12.s\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x20, [x15, #0x50]\n"
- "fmla z24.s, p3/M, z1.s, z11.s\n"
- "fmla z23.s, p3/M, z2.s, z13.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z24.s, p3/M, z3.s, z14.s\n"
- "fmla z23.s, p3/M, z0.s, z16.s\n"
- "ldr x20, [x15, #0x58]\n"
- "ldr x22, [x15, #0x78]\n"
- "fmla z24.s, p3/M, z4.s, z15.s\n"
- "fmla z23.s, p3/M, z4.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x60]\n"
- "fmla z24.s, p3/M, z2.s, z16.s\n"
- "fmla z23.s, p3/M, z5.s, z18.s\n"
- "ldr x20, [x15, #0x80]\n"
- "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
- "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
- "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
- "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0x68]\n"
- "fmla z24.s, p3/M, z5.s, z19.s\n"
- "fmla z23.s, p3/M, z3.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
- "ldr x20, [x15, #0x88]\n"
- "fmla z22.s, p3/M, z3.s, z17.s\n"
- "fmla z21.s, p3/M, z4.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z0.s, z18.s\n"
- "fmla z21.s, p3/M, z1.s, z20.s\n"
- "ldr x21, [x15, #0x70]\n"
- "ldr x20, [x15, #0x98]\n"
- "fmla z22.s, p3/M, z4.s, z17.s\n"
- "fmla z21.s, p3/M, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z24.s, p3/M, z6.s, z18.s\n"
- "ldr x21, [x15, #0x90]\n"
- "ldr x20, [x15, #0xa8]\n"
- "fmla z22.s, p3/M, z1.s, z16.s\n"
- "fmla z21.s, p3/M, z2.s, z19.s\n"
- "fmla z24.s, p3/M, z7.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
- "ldr x21, [x15, #0xa0]\n"
- "ldr x20, [x15, #0xb0]\n"
- "fmla z22.s, p3/M, z6.s, z16.s\n"
- "fmla z21.s, p3/M, z3.s, z18.s\n"
- "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z7.s, z17.s\n"
- "fmla z21.s, p3/M, z7.s, z16.s\n"
- "ldr x20, [x15, #0xb8]\n"
- "fmla z23.s, p3/M, z7.s, z20.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z5.s, z18.s\n"
- "ldr x20, [x15, #0xc0]\n"
- "fmla z21.s, p3/M, z6.s, z17.s\n"
- "fmla z23.s, p3/M, z8.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
- "fmla z22.s, p3/M, z8.s, z17.s\n"
- "fmla z21.s, p3/M, z8.s, z16.s\n"
- "incw x28\n"
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x28, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ldr x26, [x16, #0x50]\n"
+ "ldr x25, [x16, #0x58]\n"
+ "movprfx z25, z30\n fmla z25.s, p3/M, z2.s, z9.s\n"
+ "movprfx z24, z30\n fmla z24.s, p3/M, z0.s, z9.s\n"
+ "ldr x27, [x16, #0x78]\n"
+ "ldr x24, [x16, #0x60]\n"
+ "incw x9\n"
"mov p0.b, p2.b\n"
- "fmax z24.s, p3/M, z24.s, z26.s\n"
- "fmax z23.s, p3/M, z23.s, z26.s\n"
- "fmax z22.s, p3/M, z22.s, z26.s\n"
- "fmax z21.s, p3/M, z21.s, z26.s\n"
- "fmin z24.s, p3/M, z24.s, z25.s\n"
- "fmin z23.s, p3/M, z23.s, z25.s\n"
- "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
- "fmin z22.s, p3/M, z22.s, z25.s\n"
- "fmin z21.s, p3/M, z21.s, z25.s\n"
- "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
- "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "ldr x22, [x16, #0x70]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ldr x21, [x16, #0x88]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "ldr x26, [x16, #0x90]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "ldr x25, [x16, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "ld1w { z23.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "ldr x24, [x16, #0xa0]\n"
+ "fmla z25.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "ldr x23, [x16, #0xa8]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "ldr x22, [x16, #0xb0]\n"
+ "fmla z25.s, p3/M, z0.s, z23.s\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z27.s, p3/M, z5.s, z20.s\n"
+ "fmla z26.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x27, x14, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z24.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x14, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z1.s, z18.s\n"
+ "fmla z27.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x23, x14, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z20.s\n"
+ "fmla z24.s, p3/M, z5.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x14, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z24.s, p3/M, z2.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z21.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmla z24.s, p3/M, z3.s, z19.s\n"
+ "st1w { z27.s }, p0, [x13, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x12, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z8.s, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z29.s\n"
+ "fmin z25.s, p3/M, z25.s, z28.s\n"
+ "st1w { z25.s }, p0, [x11, x9, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z29.s\n"
+ "fmin z24.s, p3/M, z24.s, z28.s\n"
+ "st1w { z24.s }, p0, [x10, x9, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
index 075181a488..e1cc33db1b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,432 +88,432 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
__asm__ __volatile__(
"ptrue p3.b\n"
- "mov x12, #0x0\n"
- "mov x8, #0x0\n"
+ "mov x6, #0x0\n"
+ "mov x7, #0x0\n"
"1:" // Tile loop
- "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
"mov x25, #0x2\n"
- "mov x24, #0x2\n"
- "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
- "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
- "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
- "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
- "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
- "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "cntw x17\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
"ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
- "add x15, x17, x17\n"
- "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mov x15, #0x0\n"
"ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
"ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
- "cntw x12\n"
- "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
- "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
- "add x11, x14, x23, LSL #2\n"
- "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
- "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
- "add x9, x11, x23, LSL #2\n"
- "add x28, x15, x17\n"
+ "mul x20, x6, x24\n" // offset = tile_i * ld_input_row
+ "add x12, x8, x8\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x12, x8\n"
+ "cmp x17, %x[n_channels]\n"
"ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "mul x20, x20, x24\n" // offset *= output_tile_size
- "whilelt p2.s, XZR, %x[n_channels]\n"
- "add x27, x9, x23, LSL #2\n"
+ "mul x22, x6, x23\n" // offset = tile_i * ld_output_row
+ "add x9, x10, x8\n"
"ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "add x26, x28, x17\n"
- "add x25, x27, x23, LSL #2\n"
- "ld1w { z29.s }, p3/Z, [x10]\n"
- "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
- "add x24, x26, x17\n"
- "add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
- "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
- "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
- "cmp x12, %x[n_channels]\n"
- "add x23, x25, x23, LSL #2\n"
- "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
- "add x22, x13, x21, LSL #2\n"
- "mov x21, #0x0\n"
+ "sub x21, XZR, x17\n"
+ "madd x20, x7, x8, x20\n" // offset += tile_j * ld_input_col
+ "add x28, x9, x8\n"
+ "ld1w { z29.s }, p3/Z, [x11]\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "madd x22, x7, x16, x22\n" // offset += tile_j * ld_output_col
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "addvl x11, x11, #6\n"
+ "mul x20, x20, x26\n" // offset *= kernel_stride * output_size
+ "mul x22, x22, x25\n" // offset *= output_tile_size
+ "add x14, x14, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x20, x14, x24, LSL #2\n"
+ "add x27, x20, x24, LSL #2\n"
"ld1w { z5.s }, p2/Z, [x14]\n"
- "ld1w { z6.s }, p2/Z, [x14, x17, LSL #2]\n"
- "sub x20, XZR, x12\n"
- "ld1w { z7.s }, p2/Z, [x11]\n"
- "ld1w { z8.s }, p2/Z, [x11, x17, LSL #2]\n"
- "addvl x10, x10, #6\n"
- "ld1w { z9.s }, p2/Z, [x14, x15, LSL #2]\n"
- "ld1w { z13.s }, p2/Z, [x11, x15, LSL #2]\n"
- "ld1w { z11.s }, p2/Z, [x14, x28, LSL #2]\n"
- "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
- "ld1w { z10.s }, p2/Z, [x11, x24, LSL #2]\n"
- "ld1w { z14.s }, p2/Z, [x9]\n"
+ "ld1w { z6.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "add x26, x27, x24, LSL #2\n"
+ "add x25, x26, x24, LSL #2\n"
+ "ld1w { z7.s }, p2/Z, [x20]\n"
+ "ld1w { z8.s }, p2/Z, [x20, x8, LSL #2]\n"
+ "add x13, x13, x22, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x24, x25, x24, LSL #2\n"
+ "add x23, x13, x23, LSL #2\n"
+ "ld1w { z9.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x12, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x10, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x27]\n"
"bge 3f\n"
"2:" // Tile loop: Channel loop
- "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z5.s\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
"movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
- "ld1w { z24.s }, p2/Z, [x11, x28, LSL #2]\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z7.s\n"
- "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z8.s\n"
- "ld1w { z18.s }, p3/Z, [x10]\n"
+ "ld1w { z25.s }, p2/Z, [x20, x10, LSL #2]\n"
+ "whilelt p1.s, x17, %x[n_channels]\n"
+ "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z7.s\n"
+ "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z23.s }, p3/Z, [x11]\n"
+ "incw x15\n"
+ "incw x17\n"
+ "mov p0.b, p2.b\n"
"incw x21\n"
- "fmla z27.s, p3/M, z1.s, z6.s\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "addvl x20, x20, #1\n"
"fmla z31.s, p3/M, z1.s, z9.s\n"
- "ld1w { z23.s }, p2/Z, [x11, x26, LSL #2]\n"
- "incw x12\n"
- "fmla z26.s, p3/M, z1.s, z8.s\n"
- "fmla z30.s, p3/M, z1.s, z13.s\n"
- "ld1w { z22.s }, p3/Z, [x10, #1, MUL VL]\n"
- "mov p0.b, p2.b\n"
- "fmla z27.s, p3/M, z2.s, z9.s\n"
- "fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z8.s\n"
+ "fmla z26.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z21.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z18.s }, p2/Z, [x14, x28, LSL #2]\n"
"addvl x14, x14, #1\n"
- "fmla z26.s, p3/M, z2.s, z13.s\n"
- "fmla z30.s, p3/M, z2.s, z24.s\n"
- "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
- "addvl x11, x11, #1\n"
- "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z25.s\n"
+ "ld1w { z16.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z20.s }, p2/Z, [x27, x8, LSL #2]\n"
"fmla z31.s, p3/M, z3.s, z12.s\n"
- "ld1w { z0.s }, p2/Z, [x9, x17, LSL #2]\n"
- "incw x20\n"
- "fmla z26.s, p3/M, z3.s, z24.s\n"
- "fmla z30.s, p3/M, z3.s, z23.s\n"
- "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z27.s, p3/M, z4.s, z12.s\n"
- "fmla z31.s, p3/M, z4.s, z16.s\n"
- "ld1w { z19.s }, p2/Z, [x9, x15, LSL #2]\n"
- "ld1w { z5.s }, p2/Z, [x9, x28, LSL #2]\n"
- "fmla z26.s, p3/M, z4.s, z23.s\n"
- "fmla z30.s, p3/M, z4.s, z10.s\n"
- "ld1w { z21.s }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z27.s, p3/M, z18.s, z7.s\n"
- "fmla z31.s, p3/M, z18.s, z8.s\n"
- "ld1w { z7.s }, p1/Z, [x11]\n"
- "fmla z26.s, p3/M, z18.s, z14.s\n"
- "fmla z30.s, p3/M, z18.s, z0.s\n"
- "ld1w { z18.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z27.s, p3/M, z22.s, z8.s\n"
- "fmla z31.s, p3/M, z22.s, z13.s\n"
- "ld1w { z3.s }, p2/Z, [x9, x24, LSL #2]\n"
- "fmla z26.s, p3/M, z22.s, z0.s\n"
- "fmla z30.s, p3/M, z22.s, z19.s\n"
- "ld1w { z8.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z27.s, p3/M, z20.s, z13.s\n"
- "fmla z31.s, p3/M, z20.s, z24.s\n"
- "ld1w { z2.s }, p2/Z, [x9, x26, LSL #2]\n"
- "addvl x9, x9, #1\n"
- "fmla z26.s, p3/M, z20.s, z19.s\n"
- "fmla z30.s, p3/M, z20.s, z5.s\n"
- "ld1w { z16.s }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
- "fmla z27.s, p3/M, z17.s, z24.s\n"
- "fmla z31.s, p3/M, z17.s, z23.s\n"
- "ld1w { z25.s }, p2/Z, [x27]\n"
- "ld1w { z29.s }, p3/Z, [x10, #4, MUL VL]\n"
- "fmla z26.s, p3/M, z17.s, z5.s\n"
- "fmla z30.s, p3/M, z17.s, z2.s\n"
- "ld1w { z17.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "fmla z27.s, p3/M, z21.s, z23.s\n"
- "fmla z31.s, p3/M, z21.s, z10.s\n"
- "ld1w { z24.s }, p2/Z, [x27, x17, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z21.s, z2.s\n"
- "fmla z30.s, p3/M, z21.s, z3.s\n"
- "ld1w { z21.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "fmla z27.s, p3/M, z18.s, z14.s\n"
- "fmla z31.s, p3/M, z18.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x27, x24, LSL #2]\n"
- "fmla z26.s, p3/M, z18.s, z25.s\n"
- "fmla z30.s, p3/M, z18.s, z24.s\n"
- "ld1w { z23.s }, p3/Z, [x10, #-6, MUL VL]\n"
- "fmla z27.s, p3/M, z8.s, z0.s\n"
- "fmla z31.s, p3/M, z8.s, z19.s\n"
- "ld1w { z0.s }, p2/Z, [x27, x28, LSL #2]\n"
- "fmla z26.s, p3/M, z8.s, z24.s\n"
- "fmla z30.s, p3/M, z8.s, z22.s\n"
- "ld1w { z20.s }, p3/Z, [x10, #-5, MUL VL]\n"
- "fmla z27.s, p3/M, z16.s, z19.s\n"
- "fmla z31.s, p3/M, z16.s, z5.s\n"
- "ld1w { z19.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z25.s\n"
+ "fmla z26.s, p3/M, z3.s, z22.s\n"
+ "ld1w { z17.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x12, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z22.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z7.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "fmla z31.s, p3/M, z23.s, z8.s\n"
+ "fmla z27.s, p3/M, z23.s, z14.s\n"
+ "fmla z26.s, p3/M, z23.s, z20.s\n"
+ "ld1w { z18.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z8.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z21.s, z13.s\n"
+ "fmla z27.s, p3/M, z21.s, z20.s\n"
+ "fmla z26.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z13.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x9, LSL #2]\n"
"addvl x27, x27, #1\n"
- "fmla z26.s, p3/M, z16.s, z22.s\n"
- "fmla z30.s, p3/M, z16.s, z0.s\n"
- "ld1w { z18.s }, p3/Z, [x10, #-4, MUL VL]\n"
- "fmla z27.s, p3/M, z17.s, z5.s\n"
- "fmla z31.s, p3/M, z17.s, z2.s\n"
+ "fmla z31.s, p3/M, z16.s, z25.s\n"
+ "fmla z27.s, p3/M, z16.s, z19.s\n"
+ "fmla z26.s, p3/M, z16.s, z12.s\n"
+ "ld1w { z16.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "fmla z30.s, p3/M, z17.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x26]\n"
+ "fmla z31.s, p3/M, z17.s, z22.s\n"
+ "fmla z27.s, p3/M, z17.s, z12.s\n"
+ "ld1w { z29.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z17.s, z24.s\n"
+ "ld1w { z17.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z23.s }, p2/Z, [x26, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z22.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z24.s\n"
+ "fmla z26.s, p3/M, z0.s, z1.s\n"
+ "ld1w { z21.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "fmla z30.s, p3/M, z18.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z18.s, z20.s\n"
+ "fmla z27.s, p3/M, z18.s, z25.s\n"
+ "fmla z26.s, p3/M, z18.s, z23.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #-6, MUL VL]\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z0.s }, p2/Z, [x26, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z5.s, z22.s\n"
+ "ld1w { z20.s }, p3/Z, [x11, #-5, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z31.s, p3/M, z16.s, z12.s\n"
+ "fmla z27.s, p3/M, z16.s, z22.s\n"
+ "fmla z26.s, p3/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x11, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z17.s, z12.s\n"
"ld1w { z16.s }, p2/Z, [x25]\n"
- "fmla z26.s, p3/M, z17.s, z0.s\n"
- "fmla z30.s, p3/M, z17.s, z19.s\n"
- "ld1w { z17.s }, p3/Z, [x10, #-3, MUL VL]\n"
- "fmla z27.s, p3/M, z21.s, z2.s\n"
- "fmla z31.s, p3/M, z21.s, z3.s\n"
- "ld1w { z4.s }, p2/Z, [x25, x17, LSL #2]\n"
- "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
- "fmla z26.s, p3/M, z21.s, z19.s\n"
- "fmla z30.s, p3/M, z21.s, z1.s\n"
- "ld1w { z13.s }, p3/Z, [x10, #-2, MUL VL]\n"
- "fmla z27.s, p3/M, z23.s, z25.s\n"
- "fmla z31.s, p3/M, z23.s, z24.s\n"
- "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z23.s, z16.s\n"
- "fmla z30.s, p3/M, z23.s, z4.s\n"
- "ld1w { z5.s }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z17.s, z24.s\n"
+ "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z26.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z17.s }, p3/Z, [x11, #-3, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z24.s\n"
+ "ld1w { z9.s }, p2/Z, [x25, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z8.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "fmla z27.s, p3/M, z21.s, z19.s\n"
+ "fmla z26.s, p3/M, z21.s, z10.s\n"
+ "ld1w { z5.s }, p3/Z, [x11, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x12, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z23.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z4.s }, p3/Z, [x11, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z20.s, z23.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x10, LSL #2]\n"
"fmla z31.s, p3/M, z20.s, z22.s\n"
- "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
- "fmla z26.s, p3/M, z20.s, z4.s\n"
- "fmla z30.s, p3/M, z20.s, z25.s\n"
- "ld1w { z23.s }, p3/Z, [x10]\n"
- "fmla z27.s, p3/M, z18.s, z22.s\n"
- "fmla z31.s, p3/M, z18.s, z0.s\n"
- "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z27.s, p3/M, z20.s, z9.s\n"
+ "fmla z26.s, p3/M, z20.s, z25.s\n"
+ "ld1w { z23.s }, p3/Z, [x11]\n"
+ "fmla z30.s, p3/M, z18.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x28, LSL #2]\n"
"addvl x25, x25, #1\n"
- "fmla z26.s, p3/M, z18.s, z25.s\n"
- "fmla z30.s, p3/M, z18.s, z24.s\n"
- "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "fmla z27.s, p3/M, z18.s, z25.s\n"
+ "fmla z26.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z17.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x24]\n"
"fmla z31.s, p3/M, z17.s, z19.s\n"
- "ld1w { z18.s }, p2/Z, [x23]\n"
- "fmla z26.s, p3/M, z17.s, z24.s\n"
- "fmla z30.s, p3/M, z17.s, z8.s\n"
- "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
- "fmla z27.s, p3/M, z13.s, z19.s\n"
- "fmla z31.s, p3/M, z13.s, z1.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
- "ld1w { z14.s }, p1/Z, [x9]\n"
- "fmla z26.s, p3/M, z13.s, z8.s\n"
- "fmla z30.s, p3/M, z13.s, z22.s\n"
- "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
- "fmla z27.s, p3/M, z5.s, z16.s\n"
- "fmla z31.s, p3/M, z5.s, z4.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z5.s, z18.s\n"
- "fmla z30.s, p3/M, z5.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
- "fmla z27.s, p3/M, z23.s, z4.s\n"
+ "fmla z27.s, p3/M, z17.s, z24.s\n"
+ "fmla z26.s, p3/M, z17.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z14.s }, p1/Z, [x27]\n"
+ "fmla z27.s, p3/M, z5.s, z8.s\n"
+ "fmla z26.s, p3/M, z5.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x12, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z9.s\n"
+ "ld1w { z13.s }, p1/Z, [x20, x12, LSL #2]\n"
"fmla z31.s, p3/M, z23.s, z25.s\n"
- "ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
- "fmla z26.s, p3/M, z23.s, z17.s\n"
- "fmla z30.s, p3/M, z23.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
- "fmla z27.s, p3/M, z21.s, z25.s\n"
- "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z27.s, p3/M, z23.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z1.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z25.s\n"
"ld1w { z5.s }, p1/Z, [x14]\n"
- "fmla z26.s, p3/M, z21.s, z16.s\n"
- "fmla z30.s, p3/M, z21.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
- "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z27.s, p3/M, z21.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x17, %x[n_channels]\n"
+ "addvl x24, x24, #1\n"
+ "fmla z26.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z2.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "fmla z30.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z6.s }, p1/Z, [x14, x8, LSL #2]\n"
"fmla z31.s, p3/M, z20.s, z8.s\n"
- "addvl x10, x10, #16\n"
- "whilelt p2.s, x21, %x[n_channels]\n"
- "fmla z26.s, p3/M, z20.s, z18.s\n"
- "fmla z30.s, p3/M, z20.s, z17.s\n"
- "cmp x12, %x[n_channels]\n"
- "addvl x23, x23, #1\n"
- "fmla z27.s, p3/M, z19.s, z8.s\n"
+ "fmla z27.s, p3/M, z20.s, z18.s\n"
+ "ld1w { z11.s }, p1/Z, [x14, x10, LSL #2]\n"
+ "fmla z26.s, p3/M, z20.s, z17.s\n"
+ "ld1w { z3.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z19.s, z8.s\n"
+ "ld1w { z8.s }, p1/Z, [x20, x8, LSL #2]\n"
"fmla z31.s, p3/M, z19.s, z22.s\n"
- "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "ld1w { z10.s }, p1/Z, [x20, x28, LSL #2]\n"
+ "fmla z27.s, p3/M, z19.s, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x14, x9, LSL #2]\n"
+ "fmla z26.s, p3/M, z19.s, z16.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x12, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
"fmax z31.s, p3/M, z31.s, z15.s\n"
- "fmla z26.s, p3/M, z19.s, z17.s\n"
- "fmla z30.s, p3/M, z19.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
"fmax z26.s, p3/M, z26.s, z15.s\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
"fmin z31.s, p3/M, z31.s, z28.s\n"
- "ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
- "ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
"fmin z26.s, p3/M, z26.s, z28.s\n"
- "fmin z30.s, p3/M, z30.s, z28.s\n"
- "ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
- "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
- "st1w { z27.s }, p0, [x13]\n"
+ "st1w { z30.s }, p0, [x13]\n"
"st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
"addvl x13, x13, #1\n"
- "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
- "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
- "st1w { z26.s }, p0, [x22]\n"
- "addvl x10, x10, #-6\n"
- "st1w { z30.s }, p0, [x22, x16, LSL #2]\n"
- "addvl x22, x22, #1\n"
+ "st1w { z27.s }, p0, [x23]\n"
+ "st1w { z26.s }, p0, [x23, x16, LSL #2]\n"
+ "addvl x23, x23, #1\n"
"blt 2b\n"
"3:" // Tile loop: Channel tail
"movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
"movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
- "ld1w { z22.s }, p2/Z, [x11, x28, LSL #2]\n"
- "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ld1w { z22.s }, p2/Z, [x20, x10, LSL #2]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n"
"movprfx z5, z29\n fmla z5.s, p3/M, z0.s, z7.s\n"
"fmla z29.s, p3/M, z0.s, z8.s\n"
- "ld1w { z20.s }, p3/Z, [x10]\n"
- "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ld1w { z20.s }, p3/Z, [x11]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "mov p0.b, p2.b\n"
+ "add x7, x7, #0x1\n"
"fmla z30.s, p3/M, z1.s, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x20, x9, LSL #2]\n"
"fmla z31.s, p3/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x20, x6, #0x1\n"
"fmla z5.s, p3/M, z1.s, z8.s\n"
"fmla z29.s, p3/M, z1.s, z13.s\n"
- "ld1w { z19.s }, p3/Z, [x10, #1, MUL VL]\n"
- "add x8, x8, #0x1\n"
+ "ld1w { z19.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "cmp x7, x22\n"
+ "csel x6, x6, x20, LT\n"
+ "csel x7, x7, XZR, LT\n"
"fmla z30.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x28, LSL #2]\n"
"fmla z31.s, p3/M, z2.s, z11.s\n"
- "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
- "cmp x8, x20\n"
"fmla z5.s, p3/M, z2.s, z13.s\n"
"fmla z29.s, p3/M, z2.s, z22.s\n"
- "ld1w { z18.s }, p3/Z, [x10, #2, MUL VL]\n"
- "add x21, x12, #0x1\n"
+ "ld1w { z18.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "cmp x6, x21\n"
"fmla z30.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x8, LSL #2]\n"
"fmla z31.s, p3/M, z3.s, z12.s\n"
- "ld1w { z1.s }, p2/Z, [x9, x17, LSL #2]\n"
- "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
"fmla z5.s, p3/M, z3.s, z22.s\n"
"fmla z29.s, p3/M, z3.s, z6.s\n"
- "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
- "csel x12, x12, x21, LT\n"
+ "ld1w { z17.s }, p3/Z, [x11, #3, MUL VL]\n"
"fmla z30.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, x12, LSL #2]\n"
"fmla z31.s, p3/M, z4.s, z16.s\n"
- "ld1w { z0.s }, p2/Z, [x9, x15, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x27, x10, LSL #2]\n"
"fmla z5.s, p3/M, z4.s, z6.s\n"
"fmla z29.s, p3/M, z4.s, z10.s\n"
- "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
- "mov p0.b, p2.b\n"
+ "ld1w { z16.s }, p3/Z, [x11, #4, MUL VL]\n"
"fmla z30.s, p3/M, z20.s, z7.s\n"
"fmla z31.s, p3/M, z20.s, z8.s\n"
- "csel x8, x8, XZR, LT\n"
- "cmp x12, x20\n"
"fmla z5.s, p3/M, z20.s, z14.s\n"
"fmla z29.s, p3/M, z20.s, z1.s\n"
- "ld1w { z21.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x11, #5, MUL VL]\n"
"fmla z30.s, p3/M, z19.s, z8.s\n"
+ "ld1w { z26.s }, p2/Z, [x27, x28, LSL #2]\n"
"fmla z31.s, p3/M, z19.s, z13.s\n"
- "ld1w { z26.s }, p2/Z, [x9, x24, LSL #2]\n"
"fmla z5.s, p3/M, z19.s, z1.s\n"
"fmla z29.s, p3/M, z19.s, z0.s\n"
- "ld1w { z25.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "ld1w { z25.s }, p3/Z, [x11, #6, MUL VL]\n"
"fmla z30.s, p3/M, z18.s, z13.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x9, LSL #2]\n"
"fmla z31.s, p3/M, z18.s, z22.s\n"
- "ld1w { z24.s }, p2/Z, [x9, x26, LSL #2]\n"
"fmla z5.s, p3/M, z18.s, z0.s\n"
"fmla z29.s, p3/M, z18.s, z27.s\n"
- "ld1w { z23.s }, p3/Z, [x10, #7, MUL VL]\n"
- "addvl x10, x10, #16\n"
+ "ld1w { z23.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
"fmla z30.s, p3/M, z17.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x26]\n"
"fmla z31.s, p3/M, z17.s, z6.s\n"
- "ld1w { z22.s }, p2/Z, [x27]\n"
"fmla z5.s, p3/M, z17.s, z27.s\n"
"fmla z29.s, p3/M, z17.s, z24.s\n"
- "ld1w { z20.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x11, #-8, MUL VL]\n"
"fmla z30.s, p3/M, z16.s, z6.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x8, LSL #2]\n"
"fmla z31.s, p3/M, z16.s, z10.s\n"
- "ld1w { z19.s }, p2/Z, [x27, x17, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x26, x12, LSL #2]\n"
"fmla z5.s, p3/M, z16.s, z24.s\n"
"fmla z29.s, p3/M, z16.s, z26.s\n"
- "ld1w { z16.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x11, #-7, MUL VL]\n"
"fmla z30.s, p3/M, z21.s, z14.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x28, LSL #2]\n"
"fmla z31.s, p3/M, z21.s, z1.s\n"
- "ld1w { z17.s }, p2/Z, [x27, x24, LSL #2]\n"
"fmla z5.s, p3/M, z21.s, z22.s\n"
- "fmla z29.s, p3/M, z21.s, z19.s\n"
- "ld1w { z21.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z29.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z21.s }, p3/Z, [x11, #-6, MUL VL]\n"
"fmla z30.s, p3/M, z25.s, z1.s\n"
+ "ld1w { z8.s }, p2/Z, [x26, x10, LSL #2]\n"
"fmla z31.s, p3/M, z25.s, z0.s\n"
- "ld1w { z7.s }, p2/Z, [x27, x28, LSL #2]\n"
- "fmla z5.s, p3/M, z25.s, z19.s\n"
- "fmla z29.s, p3/M, z25.s, z18.s\n"
- "ld1w { z10.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z5.s, p3/M, z25.s, z18.s\n"
+ "fmla z29.s, p3/M, z25.s, z17.s\n"
+ "ld1w { z9.s }, p3/Z, [x11, #-5, MUL VL]\n"
"fmla z30.s, p3/M, z23.s, z0.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x9, LSL #2]\n"
"fmla z31.s, p3/M, z23.s, z27.s\n"
- "ld1w { z11.s }, p2/Z, [x27, x26, LSL #2]\n"
- "fmla z5.s, p3/M, z23.s, z18.s\n"
- "fmla z29.s, p3/M, z23.s, z7.s\n"
- "ld1w { z6.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z5.s, p3/M, z23.s, z17.s\n"
+ "fmla z29.s, p3/M, z23.s, z8.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #-4, MUL VL]\n"
"fmla z30.s, p3/M, z20.s, z27.s\n"
- "fmla z31.s, p3/M, z20.s, z24.s\n"
"ld1w { z0.s }, p2/Z, [x25]\n"
- "fmla z5.s, p3/M, z20.s, z7.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "fmla z5.s, p3/M, z20.s, z8.s\n"
"fmla z29.s, p3/M, z20.s, z11.s\n"
- "ld1w { z9.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #-3, MUL VL]\n"
"fmla z30.s, p3/M, z16.s, z24.s\n"
+ "ld1w { z2.s }, p2/Z, [x25, x8, LSL #2]\n"
"fmla z31.s, p3/M, z16.s, z26.s\n"
- "ld1w { z3.s }, p2/Z, [x25, x17, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x25, x9, LSL #2]\n"
"fmla z5.s, p3/M, z16.s, z11.s\n"
- "fmla z29.s, p3/M, z16.s, z17.s\n"
- "ld1w { z16.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z29.s, p3/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p3/Z, [x11, #-2, MUL VL]\n"
"fmla z30.s, p3/M, z21.s, z22.s\n"
- "fmla z31.s, p3/M, z21.s, z19.s\n"
- "ld1w { z26.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x25, x12, LSL #2]\n"
+ "fmla z31.s, p3/M, z21.s, z18.s\n"
"fmla z5.s, p3/M, z21.s, z0.s\n"
- "fmla z29.s, p3/M, z21.s, z3.s\n"
- "ld1w { z25.s }, p3/Z, [x10, #-1, MUL VL]\n"
- "fmla z30.s, p3/M, z10.s, z19.s\n"
- "fmla z31.s, p3/M, z10.s, z18.s\n"
- "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
- "fmla z5.s, p3/M, z10.s, z3.s\n"
- "fmla z29.s, p3/M, z10.s, z26.s\n"
- "ld1w { z23.s }, p3/Z, [x10]\n"
- "fmla z30.s, p3/M, z6.s, z18.s\n"
- "fmla z31.s, p3/M, z6.s, z7.s\n"
- "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z29.s, p3/M, z21.s, z2.s\n"
+ "ld1w { z25.s }, p3/Z, [x11, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z9.s, z18.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z9.s, z17.s\n"
+ "fmla z5.s, p3/M, z9.s, z2.s\n"
+ "fmla z29.s, p3/M, z9.s, z26.s\n"
+ "ld1w { z23.s }, p3/Z, [x11]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z8.s\n"
"fmla z5.s, p3/M, z6.s, z26.s\n"
"fmla z29.s, p3/M, z6.s, z24.s\n"
- "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
- "fmla z30.s, p3/M, z9.s, z7.s\n"
- "fmla z31.s, p3/M, z9.s, z11.s\n"
- "ld1w { z18.s }, p2/Z, [x23]\n"
- "fmla z5.s, p3/M, z9.s, z24.s\n"
- "fmla z29.s, p3/M, z9.s, z27.s\n"
- "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z4.s, z8.s\n"
+ "ld1w { z18.s }, p2/Z, [x24]\n"
+ "fmla z31.s, p3/M, z4.s, z11.s\n"
+ "fmla z5.s, p3/M, z4.s, z24.s\n"
+ "fmla z29.s, p3/M, z4.s, z27.s\n"
+ "ld1w { z20.s }, p3/Z, [x11, #2, MUL VL]\n"
"fmla z30.s, p3/M, z16.s, z11.s\n"
- "fmla z31.s, p3/M, z16.s, z17.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x8, LSL #2]\n"
+ "fmla z31.s, p3/M, z16.s, z19.s\n"
"fmla z5.s, p3/M, z16.s, z27.s\n"
"fmla z29.s, p3/M, z16.s, z22.s\n"
- "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1w { z19.s }, p3/Z, [x11, #3, MUL VL]\n"
"fmla z30.s, p3/M, z25.s, z0.s\n"
- "fmla z31.s, p3/M, z25.s, z3.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x24, x12, LSL #2]\n"
+ "fmla z31.s, p3/M, z25.s, z2.s\n"
"fmla z5.s, p3/M, z25.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x10, LSL #2]\n"
"fmla z29.s, p3/M, z25.s, z17.s\n"
- "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
- "fmla z30.s, p3/M, z23.s, z3.s\n"
+ "fmla z30.s, p3/M, z23.s, z2.s\n"
"fmla z31.s, p3/M, z23.s, z26.s\n"
"fmla z5.s, p3/M, z23.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x9, LSL #2]\n"
"fmla z29.s, p3/M, z23.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
"fmla z30.s, p3/M, z21.s, z26.s\n"
"fmla z31.s, p3/M, z21.s, z24.s\n"
"fmla z5.s, p3/M, z21.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x28, LSL #2]\n"
"fmla z29.s, p3/M, z21.s, z18.s\n"
- "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
"fmla z30.s, p3/M, z20.s, z24.s\n"
"fmla z31.s, p3/M, z20.s, z27.s\n"
"fmla z5.s, p3/M, z20.s, z18.s\n"
"fmla z29.s, p3/M, z20.s, z17.s\n"
"fmla z30.s, p3/M, z19.s, z27.s\n"
"fmla z31.s, p3/M, z19.s, z22.s\n"
- "fmax z30.s, p3/M, z30.s, z15.s\n"
- "fmax z31.s, p3/M, z31.s, z15.s\n"
"fmla z5.s, p3/M, z19.s, z17.s\n"
"fmla z29.s, p3/M, z19.s, z16.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
"fmax z5.s, p3/M, z5.s, z15.s\n"
- "fmax z29.s, p3/M, z29.s, z15.s\n"
"fmin z30.s, p3/M, z30.s, z28.s\n"
"fmin z31.s, p3/M, z31.s, z28.s\n"
- "st1w { z30.s }, p0, [x13]\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
"fmin z5.s, p3/M, z5.s, z28.s\n"
+ "st1w { z30.s }, p0, [x13]\n"
"fmin z29.s, p3/M, z29.s, z28.s\n"
"st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
- "st1w { z5.s }, p0, [x22]\n"
- "st1w { z29.s }, p0, [x22, x16, LSL #2]\n"
+ "st1w { z5.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x23, x16, LSL #2]\n"
"blt 1b\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
index bf65e04d32..517ebae6e1 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,449 +99,449 @@ void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
__asm__ __volatile__(
"ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
- "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
- "ldp x15, x14, [x20, #0x0]\n"
- "mov x13, #0x0\n"
- "ldp x12, x11, [x20, #0x10]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x16, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
"whilelt p3.s, XZR, %x[n_channels]\n"
- "ldp x21, x20, [x16, #0x0]\n"
- "cntw x10\n"
+ "cntw x14\n"
"ptrue p2.b\n"
- "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
- "ld1w { z5.s }, p3/Z, [x21, x13, LSL #2]\n"
- "cmp x10, %x[n_channels]\n"
- "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldp x27, x26, [x16, #0x10]\n"
- "sub x28, XZR, x10\n"
- "ldp x25, x24, [x16, #0x20]\n"
- "ldp x23, x22, [x16, #0x30]\n"
- "ldp x21, x20, [x16, #0x40]\n"
- "ld1rw { z15.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
- "ld1rw { z28.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
- "ld1w { z29.s }, p2/Z, [x9]\n"
- "ld1w { z0.s }, p2/Z, [x9, #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x9, #3, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x9, #4, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x9, #5, MUL VL]\n"
- "ld1w { z7.s }, p3/Z, [x27, x13, LSL #2]\n"
- "addvl x9, x9, #6\n"
- "ld1w { z8.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ld1w { z13.s }, p3/Z, [x24, x13, LSL #2]\n"
- "ld1w { z11.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z12.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ld1w { z10.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "ldp x21, x20, [x17, #0x0]\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "cmp x14, %x[n_channels]\n"
+ "sub x9, XZR, x14\n"
+ "ld1rw { z17.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z30.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z5.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "ld1w { z29.s }, p2/Z, [x15]\n"
+ "ld1w { z0.s }, p2/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x15, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x27, x16, LSL #2]\n"
+ "addvl x15, x15, #6\n"
+ "ld1w { z8.s }, p3/Z, [x26, x16, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x20, x16, LSL #2]\n"
"bge 2f\n"
"1:" // Channel loop
- "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
- "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z6.s\n"
- "ldr x20, [x16, #0x50]\n"
- "ld1w { z5.s }, p3/Z, [x20, x13, LSL #2]\n"
- "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
- "movprfx z26, z29\n fmla z26.s, p2/M, z0.s, z8.s\n"
- "ldr x20, [x16, #0x58]\n"
- "ldr x21, [x16, #0x60]\n"
- "fmla z30.s, p2/M, z1.s, z6.s\n"
- "fmla z27.s, p2/M, z1.s, z9.s\n"
- "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x20, [x16, #0x68]\n"
- "fmla z31.s, p2/M, z1.s, z8.s\n"
- "fmla z26.s, p2/M, z1.s, z13.s\n"
- "ld1w { z21.s }, p2/Z, [x9]\n"
- "ldr x23, [x16, #0x70]\n"
- "fmla z30.s, p2/M, z2.s, z9.s\n"
- "fmla z27.s, p2/M, z2.s, z11.s\n"
- "ld1w { z20.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z18.s }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z31.s, p2/M, z2.s, z13.s\n"
- "fmla z26.s, p2/M, z2.s, z5.s\n"
- "ldr x22, [x16, #0x78]\n"
- "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z27.s, p2/M, z3.s, z12.s\n"
- "ld1w { z11.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x21, [x16, #0x80]\n"
- "fmla z31.s, p2/M, z3.s, z5.s\n"
- "fmla z26.s, p2/M, z3.s, z22.s\n"
- "ld1w { z16.s }, p2/Z, [x9, #3, MUL VL]\n"
- "ldr x20, [x16, #0x88]\n"
- "fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z27.s, p2/M, z4.s, z20.s\n"
- "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z29.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z4.s, z22.s\n"
- "fmla z26.s, p2/M, z4.s, z10.s\n"
- "ld1w { z19.s }, p2/Z, [x9, #4, MUL VL]\n"
- "ldr x23, [x16, #0x90]\n"
- "fmla z30.s, p2/M, z21.s, z7.s\n"
- "fmla z27.s, p2/M, z21.s, z8.s\n"
- "ldr x26, [x16, #0x98]\n"
- "ldr x22, [x16, #0xa0]\n"
- "fmla z31.s, p2/M, z21.s, z14.s\n"
- "fmla z26.s, p2/M, z21.s, z11.s\n"
- "ld1w { z25.s }, p2/Z, [x9, #5, MUL VL]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z30.s, p2/M, z18.s, z8.s\n"
- "fmla z27.s, p2/M, z18.s, z13.s\n"
- "ld1w { z24.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla z31.s, p2/M, z18.s, z11.s\n"
- "fmla z26.s, p2/M, z18.s, z0.s\n"
- "ld1w { z18.s }, p2/Z, [x9, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z30.s, p2/M, z17.s, z13.s\n"
- "fmla z27.s, p2/M, z17.s, z5.s\n"
- "ld1w { z3.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0xc0]\n"
- "fmla z31.s, p2/M, z17.s, z0.s\n"
- "fmla z26.s, p2/M, z17.s, z29.s\n"
- "ld1w { z17.s }, p2/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "fmla z30.s, p2/M, z16.s, z5.s\n"
- "fmla z27.s, p2/M, z16.s, z22.s\n"
- "ld1w { z6.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ldr x27, [x16, #0xc8]\n"
- "fmla z31.s, p2/M, z16.s, z29.s\n"
- "fmla z26.s, p2/M, z16.s, z3.s\n"
- "ld1w { z16.s }, p2/Z, [x9, #-8, MUL VL]\n"
- "ldr x23, [x16, #0xd0]\n"
- "fmla z30.s, p2/M, z19.s, z22.s\n"
- "fmla z27.s, p2/M, z19.s, z10.s\n"
- "ld1w { z23.s }, p3/Z, [x26, x13, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z19.s, z3.s\n"
- "fmla z26.s, p2/M, z19.s, z24.s\n"
- "ld1w { z21.s }, p2/Z, [x9, #-7, MUL VL]\n"
- "ldr x22, [x16, #0xd8]\n"
- "fmla z30.s, p2/M, z25.s, z14.s\n"
- "fmla z27.s, p2/M, z25.s, z11.s\n"
- "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x20, [x16, #0xe0]\n"
- "fmla z31.s, p2/M, z25.s, z6.s\n"
- "fmla z26.s, p2/M, z25.s, z23.s\n"
- "ld1w { z20.s }, p2/Z, [x9, #-6, MUL VL]\n"
- "ldr x26, [x16, #0xf8]\n"
- "fmla z30.s, p2/M, z18.s, z11.s\n"
- "fmla z27.s, p2/M, z18.s, z0.s\n"
- "ld1w { z7.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x16, #0xe8]\n"
- "fmla z31.s, p2/M, z18.s, z23.s\n"
- "fmla z26.s, p2/M, z18.s, z22.s\n"
- "ld1w { z18.s }, p2/Z, [x9, #-5, MUL VL]\n"
- "whilelt p1.s, x10, %x[n_channels]\n"
- "fmla z30.s, p2/M, z17.s, z0.s\n"
- "fmla z27.s, p2/M, z17.s, z29.s\n"
- "ld1w { z19.s }, p3/Z, [x24, x13, LSL #2]\n"
- "ldr x24, [x16, #0xf0]\n"
- "fmla z31.s, p2/M, z17.s, z22.s\n"
- "fmla z26.s, p2/M, z17.s, z7.s\n"
- "ld1w { z17.s }, p2/Z, [x9, #-4, MUL VL]\n"
- "incw x28\n"
- "fmla z30.s, p2/M, z16.s, z29.s\n"
- "fmla z27.s, p2/M, z16.s, z3.s\n"
- "ld1w { z0.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0x100]\n"
- "fmla z31.s, p2/M, z16.s, z7.s\n"
- "fmla z26.s, p2/M, z16.s, z19.s\n"
- "ld1w { z16.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "movprfx z15, z29\n fmla z15.s, p2/M, z0.s, z5.s\n"
+ "movprfx z28, z29\n fmla z28.s, p2/M, z0.s, z6.s\n"
+ "ldr x21, [x17, #0x50]\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z7.s\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z8.s\n"
+ "ldr x22, [x17, #0x60]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "ld1w { z19.s }, p2/Z, [x15]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "incw x9\n"
+ "ld1w { z25.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ldr x21, [x17, #0x78]\n"
"mov p0.b, p3.b\n"
- "fmla z30.s, p2/M, z21.s, z3.s\n"
- "fmla z27.s, p2/M, z21.s, z24.s\n"
- "ld1w { z11.s }, p3/Z, [x27, x13, LSL #2]\n"
- "ld1w { z13.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z21.s, z19.s\n"
- "fmla z26.s, p2/M, z21.s, z1.s\n"
- "ld1w { z10.s }, p2/Z, [x9, #-2, MUL VL]\n"
- "ldr x20, [x16, #0x108]\n"
- "fmla z30.s, p2/M, z20.s, z6.s\n"
- "fmla z27.s, p2/M, z20.s, z23.s\n"
- "ld1w { z25.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ldr x23, [x16, #0x110]\n"
- "fmla z31.s, p2/M, z20.s, z0.s\n"
- "fmla z26.s, p2/M, z20.s, z11.s\n"
- "ld1w { z8.s }, p2/Z, [x9, #-1, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x9, #4, MUL VL]\n"
- "fmla z30.s, p2/M, z18.s, z23.s\n"
- "fmla z27.s, p2/M, z18.s, z22.s\n"
- "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x22, [x16, #0x118]\n"
- "fmla z31.s, p2/M, z18.s, z11.s\n"
- "fmla z26.s, p2/M, z18.s, z25.s\n"
- "ld1w { z23.s }, p2/Z, [x9]\n"
- "fmla z30.s, p2/M, z17.s, z22.s\n"
- "fmla z27.s, p2/M, z17.s, z7.s\n"
- "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z17.s, z25.s\n"
- "fmla z26.s, p2/M, z17.s, z24.s\n"
- "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z30.s, p2/M, z16.s, z7.s\n"
- "fmla z27.s, p2/M, z16.s, z19.s\n"
- "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z16.s, z24.s\n"
- "fmla z26.s, p2/M, z16.s, z13.s\n"
- "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.s, p2/M, z10.s, z19.s\n"
- "fmla z27.s, p2/M, z10.s, z1.s\n"
- "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z10.s, z13.s\n"
- "fmla z26.s, p2/M, z10.s, z22.s\n"
- "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
- "fmla z30.s, p2/M, z8.s, z0.s\n"
- "fmla z27.s, p2/M, z8.s, z11.s\n"
- "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z31.s, p2/M, z8.s, z18.s\n"
- "fmla z26.s, p2/M, z8.s, z17.s\n"
- "ld1w { z18.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldp x21, x20, [x16, #0x0]\n"
- "fmla z30.s, p2/M, z23.s, z11.s\n"
- "fmla z27.s, p2/M, z23.s, z25.s\n"
- "ld1w { z0.s }, p2/Z, [x9, #5, MUL VL]\n"
- "fmla z31.s, p2/M, z23.s, z17.s\n"
- "fmla z26.s, p2/M, z23.s, z16.s\n"
- "ld1w { z17.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z1.s }, p2/Z, [x9, #6, MUL VL]\n"
- "fmla z30.s, p2/M, z21.s, z25.s\n"
- "fmla z27.s, p2/M, z21.s, z24.s\n"
- "ld1w { z5.s }, p1/Z, [x21, x10, LSL #2]\n"
- "fmla z31.s, p2/M, z21.s, z16.s\n"
- "fmla z26.s, p2/M, z21.s, z18.s\n"
- "ld1w { z16.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldp x27, x26, [x16, #0x10]\n"
- "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z15.s, p2/M, z1.s, z6.s\n"
+ "fmla z28.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldr x27, [x17, #0x80]\n"
+ "fmla z27.s, p2/M, z1.s, z8.s\n"
+ "fmla z31.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x15, #1, MUL VL]\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ldr x23, [x17, #0x90]\n"
+ "ldr x26, [x17, #0x98]\n"
+ "fmla z15.s, p2/M, z2.s, z9.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ "fmla z28.s, p2/M, z2.s, z11.s\n"
+ "fmla z27.s, p2/M, z2.s, z13.s\n"
+ "fmla z31.s, p2/M, z2.s, z25.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z15.s, p2/M, z3.s, z11.s\n"
+ "ld1w { z2.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z28.s, p2/M, z3.s, z12.s\n"
+ "fmla z27.s, p2/M, z3.s, z25.s\n"
+ "fmla z31.s, p2/M, z3.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z15.s, p2/M, z4.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z28.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ldr x21, [x17, #0xb8]\n"
+ "fmla z27.s, p2/M, z4.s, z23.s\n"
+ "fmla z31.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z3.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z15.s, p2/M, z19.s, z7.s\n"
+ "fmla z28.s, p2/M, z19.s, z8.s\n"
+ "fmla z27.s, p2/M, z19.s, z14.s\n"
+ "fmla z31.s, p2/M, z19.s, z2.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z15.s, p2/M, z22.s, z8.s\n"
+ "ld1w { z26.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x17, #0xc8]\n"
+ "fmla z28.s, p2/M, z22.s, z13.s\n"
+ "fmla z27.s, p2/M, z22.s, z2.s\n"
+ "fmla z31.s, p2/M, z22.s, z1.s\n"
+ "ld1w { z19.s }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z15.s, p2/M, z16.s, z13.s\n"
+ "ld1w { z9.s }, p3/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x17, #0xc0]\n"
+ "fmla z28.s, p2/M, z16.s, z25.s\n"
+ "fmla z27.s, p2/M, z16.s, z1.s\n"
+ "fmla z31.s, p2/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z15.s, p2/M, z21.s, z25.s\n"
+ "ld1w { z25.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "ldr x23, [x17, #0xd0]\n"
+ "fmla z28.s, p2/M, z21.s, z23.s\n"
+ "ld1w { z29.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z27.s, p2/M, z21.s, z0.s\n"
+ "fmla z31.s, p2/M, z21.s, z9.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z15.s, p2/M, z3.s, z23.s\n"
+ "ld1w { z24.s }, p3/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x17, #0xd8]\n"
+ "fmla z28.s, p2/M, z3.s, z10.s\n"
+ "ld1w { z23.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x17, #0xe0]\n"
+ "fmla z27.s, p2/M, z3.s, z9.s\n"
+ "fmla z31.s, p2/M, z3.s, z26.s\n"
+ "ld1w { z22.s }, p2/Z, [x15, #-7, MUL VL]\n"
+ "fmla z15.s, p2/M, z20.s, z14.s\n"
+ "ld1w { z6.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ldr x26, [x17, #0xf8]\n"
+ "fmla z28.s, p2/M, z20.s, z2.s\n"
+ "fmla z27.s, p2/M, z20.s, z25.s\n"
+ "fmla z31.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, #-6, MUL VL]\n"
+ "fmla z15.s, p2/M, z19.s, z2.s\n"
+ "ld1w { z21.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "ldr x25, [x17, #0xe8]\n"
+ "fmla z28.s, p2/M, z19.s, z1.s\n"
+ "fmla z27.s, p2/M, z19.s, z24.s\n"
+ "fmla z31.s, p2/M, z19.s, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #-5, MUL VL]\n"
+ "fmla z15.s, p2/M, z18.s, z1.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "ldr x24, [x17, #0xf0]\n"
+ "fmla z28.s, p2/M, z18.s, z0.s\n"
+ "fmla z27.s, p2/M, z18.s, z23.s\n"
+ "fmla z31.s, p2/M, z18.s, z21.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, #-4, MUL VL]\n"
+ "fmla z15.s, p2/M, z16.s, z0.s\n"
+ "ld1w { z0.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x17, #0x100]\n"
+ "fmla z28.s, p2/M, z16.s, z9.s\n"
+ "fmla z27.s, p2/M, z16.s, z21.s\n"
+ "fmla z31.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x15, #-3, MUL VL]\n"
+ "fmla z15.s, p2/M, z22.s, z9.s\n"
+ "ld1w { z12.s }, p3/Z, [x28, x16, LSL #2]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z28.s, p2/M, z22.s, z26.s\n"
+ "ld1w { z4.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "fmla z27.s, p2/M, z22.s, z19.s\n"
+ "fmla z31.s, p2/M, z22.s, z6.s\n"
+ "ld1w { z14.s }, p2/Z, [x15, #-2, MUL VL]\n"
+ "fmla z15.s, p2/M, z10.s, z25.s\n"
+ "ld1w { z26.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "ldr x23, [x17, #0x110]\n"
+ "fmla z28.s, p2/M, z10.s, z24.s\n"
+ "fmla z27.s, p2/M, z10.s, z0.s\n"
+ "fmla z31.s, p2/M, z10.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x15, #-1, MUL VL]\n"
+ "fmla z15.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z25.s }, p3/Z, [x27, x16, LSL #2]\n"
+ "ldr x22, [x17, #0x118]\n"
+ "fmla z28.s, p2/M, z20.s, z23.s\n"
+ "fmla z27.s, p2/M, z20.s, z12.s\n"
+ "fmla z31.s, p2/M, z20.s, z26.s\n"
+ "ld1w { z24.s }, p2/Z, [x15]\n"
+ "fmla z15.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z23.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "fmla z28.s, p2/M, z18.s, z21.s\n"
+ "fmla z27.s, p2/M, z18.s, z26.s\n"
+ "fmla z31.s, p2/M, z18.s, z25.s\n"
+ "ld1w { z22.s }, p2/Z, [x15, #1, MUL VL]\n"
+ "fmla z15.s, p2/M, z16.s, z21.s\n"
+ "ld1w { z21.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "fmla z28.s, p2/M, z16.s, z19.s\n"
+ "fmla z27.s, p2/M, z16.s, z25.s\n"
+ "fmla z31.s, p2/M, z16.s, z4.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z15.s, p2/M, z14.s, z19.s\n"
+ "ld1w { z19.s }, p3/Z, [x26, x16, LSL #2]\n"
+ "fmla z28.s, p2/M, z14.s, z6.s\n"
+ "fmla z27.s, p2/M, z14.s, z4.s\n"
+ "fmla z31.s, p2/M, z14.s, z23.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z15.s, p2/M, z10.s, z0.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "fmla z28.s, p2/M, z10.s, z12.s\n"
+ "fmla z27.s, p2/M, z10.s, z21.s\n"
+ "ld1w { z13.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldp x21, x20, [x17, #0x0]\n"
+ "fmla z31.s, p2/M, z10.s, z19.s\n"
+ "ld1w { z0.s }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z15.s, p2/M, z24.s, z12.s\n"
+ "fmla z28.s, p2/M, z24.s, z26.s\n"
+ "fmla z27.s, p2/M, z24.s, z19.s\n"
+ "ld1w { z12.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "fmla z31.s, p2/M, z24.s, z16.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z15.s, p2/M, z22.s, z26.s\n"
+ "ld1w { z5.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "fmla z28.s, p2/M, z22.s, z25.s\n"
+ "fmla z27.s, p2/M, z22.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "incw x16\n"
+ "fmla z31.s, p2/M, z22.s, z13.s\n"
+ "ld1w { z2.s }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z15.s, p2/M, z20.s, z25.s\n"
+ "ld1w { z6.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "ld1w { z7.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "fmla z28.s, p2/M, z20.s, z4.s\n"
"fmla z27.s, p2/M, z20.s, z13.s\n"
- "ld1w { z6.s }, p1/Z, [x20, x10, LSL #2]\n"
- "ldp x25, x24, [x16, #0x20]\n"
- "fmla z31.s, p2/M, z20.s, z18.s\n"
- "fmla z26.s, p2/M, z20.s, z17.s\n"
- "ldp x23, x22, [x16, #0x30]\n"
- "ldp x21, x20, [x16, #0x40]\n"
- "fmla z30.s, p2/M, z19.s, z13.s\n"
- "fmla z27.s, p2/M, z19.s, z22.s\n"
- "incw x13\n"
- "ld1w { z7.s }, p1/Z, [x27, x10, LSL #2]\n"
- "fmla z31.s, p2/M, z19.s, z17.s\n"
- "fmla z26.s, p2/M, z19.s, z16.s\n"
- "ld1w { z8.s }, p1/Z, [x26, x10, LSL #2]\n"
- "ld1w { z9.s }, p1/Z, [x25, x10, LSL #2]\n"
- "ld1w { z13.s }, p1/Z, [x24, x10, LSL #2]\n"
- "ld1w { z11.s }, p1/Z, [x23, x10, LSL #2]\n"
- "fmax z30.s, p2/M, z30.s, z15.s\n"
- "fmax z27.s, p2/M, z27.s, z15.s\n"
- "ld1w { z12.s }, p1/Z, [x22, x10, LSL #2]\n"
- "ld1w { z10.s }, p1/Z, [x21, x10, LSL #2]\n"
- "fmax z31.s, p2/M, z31.s, z15.s\n"
- "fmax z26.s, p2/M, z26.s, z15.s\n"
- "ld1w { z14.s }, p1/Z, [x20, x10, LSL #2]\n"
- "incw x10\n"
- "ld1w { z2.s }, p2/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "whilelt p3.s, x13, %x[n_channels]\n"
- "cmp x10, %x[n_channels]\n"
- "ld1w { z3.s }, p2/Z, [x9, #-8, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [x9, #-7, MUL VL]\n"
- "fmin z30.s, p2/M, z30.s, z28.s\n"
- "fmin z27.s, p2/M, z27.s, z28.s\n"
- "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
- "fmin z31.s, p2/M, z31.s, z28.s\n"
- "fmin z26.s, p2/M, z26.s, z28.s\n"
- "st1w { z27.s }, p0, [x14, x28, LSL #2]\n"
- "st1w { z31.s }, p0, [x12, x28, LSL #2]\n"
- "addvl x9, x9, #-6\n"
- "st1w { z26.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "whilelt p3.s, x16, %x[n_channels]\n"
+ "fmla z31.s, p2/M, z20.s, z12.s\n"
+ "ld1w { z3.s }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z15.s, p2/M, z18.s, z4.s\n"
+ "ld1w { z8.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "fmla z28.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z10.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "fmla z27.s, p2/M, z18.s, z12.s\n"
+ "ld1w { z12.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "fmla z31.s, p2/M, z18.s, z16.s\n"
+ "ld1w { z9.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "incw x14\n"
+ "ld1w { z4.s }, p2/Z, [x15, #-7, MUL VL]\n"
+ "addvl x15, x15, #-6\n"
+ "fmax z15.s, p2/M, z15.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "cmp x14, %x[n_channels]\n"
+ "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "fmin z15.s, p2/M, z15.s, z30.s\n"
+ "fmin z28.s, p2/M, z28.s, z30.s\n"
+ "fmin z27.s, p2/M, z27.s, z30.s\n"
+ "fmin z31.s, p2/M, z31.s, z30.s\n"
+ "st1w { z15.s }, p0, [x13, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x12, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x11, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x9, LSL #2]\n"
"blt 1b\n"
"2:" // Channel tail
- "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
- "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z6.s\n"
- "ldr x20, [x16, #0x50]\n"
- "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
- "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z7.s\n"
- "fmla z29.s, p2/M, z0.s, z8.s\n"
- "ldr x20, [x16, #0x58]\n"
- "ldr x21, [x16, #0x60]\n"
- "fmla z30.s, p2/M, z1.s, z6.s\n"
- "fmla z31.s, p2/M, z1.s, z9.s\n"
- "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x20, [x16, #0x68]\n"
- "fmla z5.s, p2/M, z1.s, z8.s\n"
- "fmla z29.s, p2/M, z1.s, z13.s\n"
- "ld1w { z20.s }, p2/Z, [x9]\n"
- "ldr x23, [x16, #0x70]\n"
- "fmla z30.s, p2/M, z2.s, z9.s\n"
- "fmla z31.s, p2/M, z2.s, z11.s\n"
- "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ld1w { z19.s }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z5.s, p2/M, z2.s, z13.s\n"
- "fmla z29.s, p2/M, z2.s, z22.s\n"
- "ldr x21, [x16, #0x78]\n"
- "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.s, p2/M, z3.s, z11.s\n"
- "fmla z31.s, p2/M, z3.s, z12.s\n"
- "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x22, [x16, #0x80]\n"
- "fmla z5.s, p2/M, z3.s, z22.s\n"
- "fmla z29.s, p2/M, z3.s, z6.s\n"
- "ld1w { z17.s }, p2/Z, [x9, #3, MUL VL]\n"
- "ldr x20, [x16, #0x88]\n"
- "fmla z30.s, p2/M, z4.s, z12.s\n"
- "fmla z31.s, p2/M, z4.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ld1w { z27.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z4.s, z6.s\n"
- "fmla z29.s, p2/M, z4.s, z10.s\n"
- "ld1w { z16.s }, p2/Z, [x9, #4, MUL VL]\n"
- "ldr x21, [x16, #0x90]\n"
- "fmla z30.s, p2/M, z20.s, z7.s\n"
- "fmla z31.s, p2/M, z20.s, z8.s\n"
- "ldr x27, [x16, #0x98]\n"
- "ldr x26, [x16, #0xa0]\n"
- "fmla z5.s, p2/M, z20.s, z14.s\n"
- "fmla z29.s, p2/M, z20.s, z1.s\n"
- "ld1w { z21.s }, p2/Z, [x9, #5, MUL VL]\n"
- "ldr x25, [x16, #0xa8]\n"
- "fmla z30.s, p2/M, z19.s, z8.s\n"
- "fmla z31.s, p2/M, z19.s, z13.s\n"
- "ld1w { z26.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x24, [x16, #0xb0]\n"
- "fmla z5.s, p2/M, z19.s, z1.s\n"
- "fmla z29.s, p2/M, z19.s, z0.s\n"
- "ld1w { z25.s }, p2/Z, [x9, #6, MUL VL]\n"
- "ldr x20, [x16, #0xb8]\n"
- "fmla z30.s, p2/M, z18.s, z13.s\n"
- "fmla z31.s, p2/M, z18.s, z22.s\n"
- "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ldr x23, [x16, #0xc0]\n"
- "fmla z5.s, p2/M, z18.s, z0.s\n"
- "fmla z29.s, p2/M, z18.s, z27.s\n"
- "ld1w { z23.s }, p2/Z, [x9, #7, MUL VL]\n"
- "addvl x9, x9, #16\n"
- "fmla z30.s, p2/M, z17.s, z22.s\n"
- "fmla z31.s, p2/M, z17.s, z6.s\n"
- "ld1w { z22.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x22, [x16, #0xc8]\n"
- "fmla z5.s, p2/M, z17.s, z27.s\n"
- "fmla z29.s, p2/M, z17.s, z24.s\n"
- "ld1w { z20.s }, p2/Z, [x9, #-8, MUL VL]\n"
- "ldr x21, [x16, #0xd0]\n"
- "fmla z30.s, p2/M, z16.s, z6.s\n"
- "fmla z31.s, p2/M, z16.s, z10.s\n"
- "ld1w { z19.s }, p3/Z, [x27, x13, LSL #2]\n"
- "ld1w { z18.s }, p3/Z, [x26, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z16.s, z24.s\n"
- "fmla z29.s, p2/M, z16.s, z26.s\n"
- "ld1w { z16.s }, p2/Z, [x9, #-7, MUL VL]\n"
- "ldr x27, [x16, #0xd8]\n"
- "fmla z30.s, p2/M, z21.s, z14.s\n"
+ "movprfx z16, z29\n fmla z16.s, p2/M, z0.s, z5.s\n"
+ "movprfx z15, z29\n fmla z15.s, p2/M, z0.s, z6.s\n"
+ "ldr x22, [x17, #0x50]\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
+ "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x17, #0x60]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "ld1w { z25.s }, p2/Z, [x15]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "incw x9\n"
+ "mov p0.b, p3.b\n"
+ "ld1w { z24.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z16.s, p2/M, z1.s, z6.s\n"
+ "fmla z15.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ldr x27, [x17, #0x80]\n"
+ "fmla z31.s, p2/M, z1.s, z8.s\n"
+ "fmla z5.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #1, MUL VL]\n"
+ "ldr x22, [x17, #0x88]\n"
+ "ldr x21, [x17, #0x90]\n"
+ "ldr x26, [x17, #0x98]\n"
+ "fmla z16.s, p2/M, z2.s, z9.s\n"
+ "fmla z15.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x17, #0xa0]\n"
+ "fmla z31.s, p2/M, z2.s, z13.s\n"
+ "fmla z5.s, p2/M, z2.s, z24.s\n"
+ "ld1w { z22.s }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z16.s, p2/M, z3.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z15.s, p2/M, z3.s, z12.s\n"
+ "fmla z31.s, p2/M, z3.s, z24.s\n"
+ "fmla z5.s, p2/M, z3.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z16.s, p2/M, z4.s, z12.s\n"
+ "ld1w { z0.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z15.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z29.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z31.s, p2/M, z4.s, z23.s\n"
+ "fmla z5.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "fmla z16.s, p2/M, z25.s, z7.s\n"
+ "fmla z15.s, p2/M, z25.s, z8.s\n"
+ "fmla z31.s, p2/M, z25.s, z14.s\n"
+ "fmla z5.s, p2/M, z25.s, z1.s\n"
+ "ld1w { z18.s }, p2/Z, [x15, #5, MUL VL]\n"
+ "fmla z16.s, p2/M, z20.s, z8.s\n"
+ "ld1w { z28.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ldr x28, [x17, #0xc8]\n"
+ "fmla z15.s, p2/M, z20.s, z13.s\n"
+ "fmla z31.s, p2/M, z20.s, z1.s\n"
+ "fmla z5.s, p2/M, z20.s, z0.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #6, MUL VL]\n"
+ "fmla z16.s, p2/M, z22.s, z13.s\n"
+ "ld1w { z27.s }, p3/Z, [x27, x16, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z15.s, p2/M, z22.s, z24.s\n"
+ "fmla z31.s, p2/M, z22.s, z0.s\n"
+ "fmla z5.s, p2/M, z22.s, z29.s\n"
+ "ld1w { z26.s }, p2/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z16.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z25.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z15.s, p2/M, z21.s, z23.s\n"
+ "fmla z31.s, p2/M, z21.s, z29.s\n"
+ "fmla z5.s, p2/M, z21.s, z27.s\n"
+ "ld1w { z24.s }, p2/Z, [x15, #-8, MUL VL]\n"
+ "fmla z16.s, p2/M, z19.s, z23.s\n"
+ "ld1w { z23.s }, p3/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x17, #0xd8]\n"
+ "fmla z15.s, p2/M, z19.s, z10.s\n"
+ "ld1w { z22.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x17, #0xe0]\n"
+ "fmla z31.s, p2/M, z19.s, z27.s\n"
+ "fmla z5.s, p2/M, z19.s, z28.s\n"
+ "ld1w { z19.s }, p2/Z, [x15, #-7, MUL VL]\n"
+ "fmla z16.s, p2/M, z18.s, z14.s\n"
+ "ld1w { z2.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "ldr x26, [x17, #0xf8]\n"
+ "fmla z15.s, p2/M, z18.s, z1.s\n"
+ "fmla z31.s, p2/M, z18.s, z25.s\n"
+ "fmla z5.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x15, #-6, MUL VL]\n"
+ "fmla z16.s, p2/M, z20.s, z1.s\n"
+ "ld1w { z18.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "ldr x25, [x17, #0xe8]\n"
+ "fmla z15.s, p2/M, z20.s, z0.s\n"
+ "fmla z31.s, p2/M, z20.s, z23.s\n"
+ "fmla z5.s, p2/M, z20.s, z22.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #-5, MUL VL]\n"
+ "fmla z16.s, p2/M, z26.s, z0.s\n"
+ "ld1w { z9.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "ldr x24, [x17, #0xf0]\n"
+ "fmla z15.s, p2/M, z26.s, z29.s\n"
+ "fmla z31.s, p2/M, z26.s, z22.s\n"
+ "fmla z5.s, p2/M, z26.s, z18.s\n"
+ "ld1w { z4.s }, p2/Z, [x15, #-4, MUL VL]\n"
+ "fmla z16.s, p2/M, z24.s, z29.s\n"
+ "ld1w { z1.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z15.s, p2/M, z24.s, z27.s\n"
+ "fmla z31.s, p2/M, z24.s, z18.s\n"
+ "fmla z5.s, p2/M, z24.s, z9.s\n"
+ "ld1w { z3.s }, p2/Z, [x15, #-3, MUL VL]\n"
+ "fmla z16.s, p2/M, z19.s, z27.s\n"
+ "ld1w { z0.s }, p3/Z, [x28, x16, LSL #2]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z15.s, p2/M, z19.s, z28.s\n"
+ "ld1w { z29.s }, p3/Z, [x20, x16, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z9.s\n"
+ "fmla z5.s, p2/M, z19.s, z2.s\n"
+ "ld1w { z19.s }, p2/Z, [x15, #-2, MUL VL]\n"
+ "fmla z16.s, p2/M, z21.s, z25.s\n"
+ "ld1w { z28.s }, p3/Z, [x21, x16, LSL #2]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z15.s, p2/M, z21.s, z23.s\n"
"fmla z31.s, p2/M, z21.s, z1.s\n"
- "ld1w { z17.s }, p3/Z, [x20, x13, LSL #2]\n"
- "ldr x20, [x16, #0xe0]\n"
- "fmla z5.s, p2/M, z21.s, z22.s\n"
- "fmla z29.s, p2/M, z21.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x9, #-6, MUL VL]\n"
- "ldr x26, [x16, #0xf8]\n"
- "fmla z30.s, p2/M, z25.s, z1.s\n"
- "fmla z31.s, p2/M, z25.s, z0.s\n"
- "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
- "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.s, p2/M, z21.s, z0.s\n"
+ "ld1w { z27.s }, p2/Z, [x15, #-1, MUL VL]\n"
+ "fmla z16.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z26.s }, p3/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z15.s, p2/M, z20.s, z22.s\n"
+ "fmla z31.s, p2/M, z20.s, z0.s\n"
+ "fmla z5.s, p2/M, z20.s, z28.s\n"
+ "ld1w { z25.s }, p2/Z, [x15]\n"
+ "fmla z16.s, p2/M, z4.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p2/M, z4.s, z18.s\n"
+ "fmla z31.s, p2/M, z4.s, z28.s\n"
+ "fmla z5.s, p2/M, z4.s, z26.s\n"
+ "ld1w { z23.s }, p2/Z, [x15, #1, MUL VL]\n"
+ "fmla z16.s, p2/M, z3.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x16, LSL #2]\n"
+ "fmla z15.s, p2/M, z3.s, z9.s\n"
+ "fmla z31.s, p2/M, z3.s, z26.s\n"
+ "fmla z5.s, p2/M, z3.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x15, #2, MUL VL]\n"
+ "fmla z16.s, p2/M, z19.s, z9.s\n"
+ "ld1w { z21.s }, p3/Z, [x26, x16, LSL #2]\n"
+ "fmla z15.s, p2/M, z19.s, z2.s\n"
+ "fmla z31.s, p2/M, z19.s, z29.s\n"
+ "fmla z5.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x15, #3, MUL VL]\n"
+ "fmla z16.s, p2/M, z27.s, z1.s\n"
+ "ld1w { z19.s }, p3/Z, [x23, x16, LSL #2]\n"
+ "fmla z15.s, p2/M, z27.s, z0.s\n"
+ "fmla z31.s, p2/M, z27.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x16, LSL #2]\n"
+ "fmla z5.s, p2/M, z27.s, z21.s\n"
+ "fmla z16.s, p2/M, z25.s, z0.s\n"
+ "fmla z15.s, p2/M, z25.s, z28.s\n"
+ "fmla z31.s, p2/M, z25.s, z21.s\n"
+ "ld1w { z21.s }, p3/Z, [x21, x16, LSL #2]\n"
"fmla z5.s, p2/M, z25.s, z19.s\n"
- "fmla z29.s, p2/M, z25.s, z18.s\n"
- "ld1w { z4.s }, p2/Z, [x9, #-5, MUL VL]\n"
- "incw x28\n"
- "fmla z30.s, p2/M, z23.s, z0.s\n"
- "fmla z31.s, p2/M, z23.s, z27.s\n"
- "ld1w { z8.s }, p3/Z, [x24, x13, LSL #2]\n"
- "ldr x24, [x16, #0xf0]\n"
+ "fmla z16.s, p2/M, z23.s, z28.s\n"
+ "fmla z15.s, p2/M, z23.s, z26.s\n"
+ "fmla z31.s, p2/M, z23.s, z19.s\n"
+ "ld1w { z12.s }, p3/Z, [x20, x16, LSL #2]\n"
"fmla z5.s, p2/M, z23.s, z18.s\n"
- "fmla z29.s, p2/M, z23.s, z9.s\n"
- "ld1w { z6.s }, p2/Z, [x9, #-4, MUL VL]\n"
- "mov p0.b, p3.b\n"
- "fmla z30.s, p2/M, z20.s, z27.s\n"
- "fmla z31.s, p2/M, z20.s, z24.s\n"
- "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
- "ldr x23, [x16, #0x100]\n"
- "fmla z5.s, p2/M, z20.s, z9.s\n"
- "fmla z29.s, p2/M, z20.s, z8.s\n"
- "ld1w { z11.s }, p2/Z, [x9, #-3, MUL VL]\n"
- "fmla z30.s, p2/M, z16.s, z24.s\n"
- "fmla z31.s, p2/M, z16.s, z26.s\n"
- "ld1w { z0.s }, p3/Z, [x22, x13, LSL #2]\n"
- "ld1w { z27.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z16.s, z8.s\n"
- "fmla z29.s, p2/M, z16.s, z17.s\n"
- "ld1w { z16.s }, p2/Z, [x9, #-2, MUL VL]\n"
- "ldr x22, [x16, #0x108]\n"
- "fmla z30.s, p2/M, z21.s, z22.s\n"
- "fmla z31.s, p2/M, z21.s, z19.s\n"
- "ld1w { z26.s }, p3/Z, [x21, x13, LSL #2]\n"
- "ldr x21, [x16, #0x110]\n"
- "fmla z5.s, p2/M, z21.s, z10.s\n"
- "fmla z29.s, p2/M, z21.s, z0.s\n"
- "ld1w { z25.s }, p2/Z, [x9, #-1, MUL VL]\n"
- "fmla z30.s, p2/M, z4.s, z19.s\n"
- "fmla z31.s, p2/M, z4.s, z18.s\n"
- "ld1w { z24.s }, p3/Z, [x27, x13, LSL #2]\n"
- "ldr x20, [x16, #0x118]\n"
- "fmla z5.s, p2/M, z4.s, z0.s\n"
- "fmla z29.s, p2/M, z4.s, z26.s\n"
- "ld1w { z23.s }, p2/Z, [x9]\n"
- "fmla z30.s, p2/M, z6.s, z18.s\n"
- "fmla z31.s, p2/M, z6.s, z9.s\n"
- "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z6.s, z26.s\n"
- "fmla z29.s, p2/M, z6.s, z24.s\n"
- "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
- "fmla z30.s, p2/M, z11.s, z9.s\n"
- "fmla z31.s, p2/M, z11.s, z8.s\n"
- "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z11.s, z24.s\n"
- "fmla z29.s, p2/M, z11.s, z27.s\n"
- "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
- "fmla z30.s, p2/M, z16.s, z8.s\n"
- "fmla z31.s, p2/M, z16.s, z17.s\n"
- "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z16.s, z27.s\n"
- "fmla z29.s, p2/M, z16.s, z22.s\n"
- "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
- "fmla z30.s, p2/M, z25.s, z10.s\n"
- "fmla z31.s, p2/M, z25.s, z0.s\n"
- "ld1w { z16.s }, p3/Z, [x23, x13, LSL #2]\n"
- "fmla z5.s, p2/M, z25.s, z18.s\n"
- "fmla z29.s, p2/M, z25.s, z17.s\n"
- "ld1w { z18.s }, p3/Z, [x22, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z23.s, z0.s\n"
- "fmla z31.s, p2/M, z23.s, z26.s\n"
- "fmla z5.s, p2/M, z23.s, z17.s\n"
- "fmla z29.s, p2/M, z23.s, z16.s\n"
- "ld1w { z17.s }, p3/Z, [x21, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z21.s, z26.s\n"
- "fmla z31.s, p2/M, z21.s, z24.s\n"
- "fmla z5.s, p2/M, z21.s, z16.s\n"
- "fmla z29.s, p2/M, z21.s, z18.s\n"
- "ld1w { z16.s }, p3/Z, [x20, x13, LSL #2]\n"
- "fmla z30.s, p2/M, z20.s, z24.s\n"
- "fmla z31.s, p2/M, z20.s, z27.s\n"
- "fmla z5.s, p2/M, z20.s, z18.s\n"
- "fmla z29.s, p2/M, z20.s, z17.s\n"
- "fmla z30.s, p2/M, z19.s, z27.s\n"
- "fmla z31.s, p2/M, z19.s, z22.s\n"
- "fmax z30.s, p2/M, z30.s, z15.s\n"
- "fmax z31.s, p2/M, z31.s, z15.s\n"
- "fmla z5.s, p2/M, z19.s, z17.s\n"
- "fmla z29.s, p2/M, z19.s, z16.s\n"
- "fmax z5.s, p2/M, z5.s, z15.s\n"
- "fmax z29.s, p2/M, z29.s, z15.s\n"
- "fmin z30.s, p2/M, z30.s, z28.s\n"
- "fmin z31.s, p2/M, z31.s, z28.s\n"
- "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
- "fmin z5.s, p2/M, z5.s, z28.s\n"
- "fmin z29.s, p2/M, z29.s, z28.s\n"
- "st1w { z31.s }, p0, [x14, x28, LSL #2]\n"
- "st1w { z5.s }, p0, [x12, x28, LSL #2]\n"
- "st1w { z29.s }, p0, [x11, x28, LSL #2]\n"
+ "fmla z16.s, p2/M, z22.s, z26.s\n"
+ "fmla z15.s, p2/M, z22.s, z29.s\n"
+ "fmla z31.s, p2/M, z22.s, z18.s\n"
+ "fmla z5.s, p2/M, z22.s, z21.s\n"
+ "fmla z16.s, p2/M, z20.s, z29.s\n"
+ "fmla z15.s, p2/M, z20.s, z24.s\n"
+ "fmla z31.s, p2/M, z20.s, z21.s\n"
+ "fmla z5.s, p2/M, z20.s, z12.s\n"
+ "fmax z16.s, p2/M, z16.s, z17.s\n"
+ "fmax z15.s, p2/M, z15.s, z17.s\n"
+ "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "fmin z16.s, p2/M, z16.s, z30.s\n"
+ "fmin z15.s, p2/M, z15.s, z30.s\n"
+ "fmax z5.s, p2/M, z5.s, z17.s\n"
+ "fmin z31.s, p2/M, z31.s, z30.s\n"
+ "st1w { z16.s }, p0, [x13, x9, LSL #2]\n"
+ "fmin z5.s, p2/M, z5.s, z30.s\n"
+ "st1w { z15.s }, p0, [x12, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x11, x9, LSL #2]\n"
+ "st1w { z5.s }, p0, [x10, x9, LSL #2]\n"
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
index d53daaa8a0..b5e2ef92f7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,118 +45,118 @@ void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
__asm__ __volatile__(
"ptrue p1.b\n"
- "mov x11, #0x0\n"
+ "mov x9, #0x0\n"
"ld1rw { z2.s }, p1/Z, [%x[minmax_vals]]\n"
"ld1rw { z1.s }, p1/Z, [%x[minmax_vals], #4]\n"
- "whilelt p0.s, x11, %x[n_channels]\n"
+ "whilelt p0.s, x9, %x[n_channels]\n"
"1:" // Channel loop
"mov z23.b, #0x0\n"
"cbz %x[bias], 2f\n"
- "ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
"2:" // Channel loop: Load bias: Done
- "mov x10, %x[inptrs]\n"
- "ldp x28, x27, [x10], #0x10\n"
- "ldp x26, x25, [x10], #0x10\n"
- "subs x9, %x[n_points], #0x1\n"
- "ldp x24, x23, [x10], #0x10\n"
- "ldp x22, x21, [x10], #0x10\n"
+ "mov x25, %x[inptrs]\n"
+ "subs x24, %x[n_points], #0x1\n"
"mov z24.d, z23.d\n"
"mov z25.d, z23.d\n"
- "ldr x20, [x10], #0x8\n"
"mov z26.d, z23.d\n"
"mov z27.d, z23.d\n"
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "addvl %x[params], %x[params], #1\n"
"mov z28.d, z23.d\n"
"mov z29.d, z23.d\n"
- "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
- "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ldp x23, x20, [x25], #0x10\n"
"mov z30.d, z23.d\n"
"mov z31.d, z23.d\n"
- "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
- "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
- "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
- "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
- "addvl %x[params], %x[params], #1\n"
- "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
- "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
- "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "ldp x22, x21, [x25], #0x10\n"
+ "ld1w { z14.s }, p0/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x20, x9, LSL #2]\n"
+ "ldp x23, x20, [x25], #0x10\n"
+ "ld1w { z16.s }, p0/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x21, x9, LSL #2]\n"
+ "ldp x22, x21, [x25], #0x10\n"
+ "ld1w { z18.s }, p0/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ld1w { z20.s }, p0/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x9, LSL #2]\n"
"ble 4f\n"
"3:" // Channel loop: Planar loop
- "ldp x28, x27, [x10], #0x10\n"
- "ldp x26, x25, [x10], #0x10\n"
- "subs x9, x9, #0x1\n"
+ "ldp x23, x20, [x25], #0x10\n"
+ "subs x24, x24, #0x1\n"
"fmla z23.s, p1/M, z14.s, z0.s\n"
- "ldp x24, x23, [x10], #0x10\n"
- "ldp x22, x21, [x10], #0x10\n"
"fmla z24.s, p1/M, z15.s, z0.s\n"
"fmla z25.s, p1/M, z16.s, z0.s\n"
- "ldr x20, [x10], #0x8\n"
"fmla z26.s, p1/M, z17.s, z0.s\n"
"fmla z27.s, p1/M, z18.s, z0.s\n"
- "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
"fmla z28.s, p1/M, z19.s, z0.s\n"
+ "ldp x22, x21, [x25], #0x10\n"
"fmla z29.s, p1/M, z20.s, z0.s\n"
- "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
"fmla z30.s, p1/M, z21.s, z0.s\n"
+ "ld1w { z14.s }, p0/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x20, x9, LSL #2]\n"
"fmla z31.s, p1/M, z22.s, z0.s\n"
"ld1w { z0.s }, p1/Z, [%x[params]]\n"
- "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
- "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
- "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
"addvl %x[params], %x[params], #1\n"
- "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
- "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
- "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "ldp x23, x20, [x25], #0x10\n"
+ "ld1w { z16.s }, p0/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x21, x9, LSL #2]\n"
+ "ldp x22, x21, [x25], #0x10\n"
+ "ld1w { z18.s }, p0/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ld1w { z20.s }, p0/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x9, LSL #2]\n"
"bgt 3b\n"
"4:" // Channel loop: Planar tail
"fmla z23.s, p1/M, z14.s, z0.s\n"
"fmla z24.s, p1/M, z15.s, z0.s\n"
- "fmax z23.s, p1/M, z23.s, z2.s\n"
- "fmax z24.s, p1/M, z24.s, z2.s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
"fmla z25.s, p1/M, z16.s, z0.s\n"
"fmla z26.s, p1/M, z17.s, z0.s\n"
- "fmax z25.s, p1/M, z25.s, z2.s\n"
- "fmax z26.s, p1/M, z26.s, z2.s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmla z27.s, p1/M, z18.s, z0.s\n"
"fmla z28.s, p1/M, z19.s, z0.s\n"
- "fmax z27.s, p1/M, z27.s, z2.s\n"
- "fmax z28.s, p1/M, z28.s, z2.s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
"fmla z29.s, p1/M, z20.s, z0.s\n"
"fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "fmax z23.s, p1/M, z23.s, z2.s\n"
+ "fmax z24.s, p1/M, z24.s, z2.s\n"
+ "fmax z25.s, p1/M, z25.s, z2.s\n"
+ "fmax z26.s, p1/M, z26.s, z2.s\n"
+ "fmax z27.s, p1/M, z27.s, z2.s\n"
+ "fmax z28.s, p1/M, z28.s, z2.s\n"
"fmax z29.s, p1/M, z29.s, z2.s\n"
"fmax z30.s, p1/M, z30.s, z2.s\n"
- "fmla z31.s, p1/M, z22.s, z0.s\n"
"fmax z31.s, p1/M, z31.s, z2.s\n"
- "ldp x28, x27, [%x[outptrs], #0x0]\n"
- "ldp x26, x25, [%x[outptrs], #0x10]\n"
- "ldp x24, x23, [%x[outptrs], #0x20]\n"
- "ldp x22, x21, [%x[outptrs], #0x30]\n"
"fmin z23.s, p1/M, z23.s, z1.s\n"
"fmin z24.s, p1/M, z24.s, z1.s\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
"fmin z25.s, p1/M, z25.s, z1.s\n"
"fmin z26.s, p1/M, z26.s, z1.s\n"
- "st1w { z23.s }, p0, [x28, x11, LSL #2]\n"
"fmin z27.s, p1/M, z27.s, z1.s\n"
"fmin z28.s, p1/M, z28.s, z1.s\n"
- "st1w { z24.s }, p0, [x27, x11, LSL #2]\n"
"fmin z29.s, p1/M, z29.s, z1.s\n"
+ "st1w { z23.s }, p0, [x28, x9, LSL #2]\n"
"fmin z30.s, p1/M, z30.s, z1.s\n"
- "st1w { z25.s }, p0, [x26, x11, LSL #2]\n"
"fmin z31.s, p1/M, z31.s, z1.s\n"
- "st1w { z26.s }, p0, [x25, x11, LSL #2]\n"
- "st1w { z27.s }, p0, [x24, x11, LSL #2]\n"
- "st1w { z28.s }, p0, [x23, x11, LSL #2]\n"
- "st1w { z29.s }, p0, [x22, x11, LSL #2]\n"
- "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
- "st1w { z31.s }, p0, [x20, x11, LSL #2]\n"
- "incw x11\n"
- "whilelt p0.s, x11, %x[n_channels]\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p0.s, x9, %x[n_channels]\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
index 3a71baaf61..4676465037 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,49 +43,49 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
__asm__ __volatile__(
"mov x17, #0x0\n"
- "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "ptrue p2.b\n"
"ldr x16, [%x[inptrs], #0x0]\n"
"ldr x15, [%x[inptrs], #0x8]\n"
- "ptrue p1.b\n"
"ldr x14, [%x[inptrs], #0x10]\n"
"ldr x13, [%x[inptrs], #0x18]\n"
"mov x12, #0x0\n"
"ldr x11, [%x[inptrs], #0x20]\n"
"ldr x10, [%x[inptrs], #0x28]\n"
+ "whilelt p1.s, x17, %x[channel_multiplier]\n"
"ldr x9, [%x[inptrs], #0x30]\n"
- "ld1w { z24.s }, p2/Z, [%x[params]]\n"
- "mov z21.d, z24.d\n"
- "mov z25.d, z24.d\n"
"ldp x28, x27, [%x[outptrs], #0x0]\n"
"ldp x26, x25, [%x[outptrs], #0x10]\n"
- "mov z27.d, z24.d\n"
- "mov z26.d, z24.d\n"
"ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ld1w { z24.s }, p1/Z, [%x[params]]\n"
"ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "ld1rqw { z2.s }, p2/Z, [x16]\n"
+ "ld1rqw { z3.s }, p2/Z, [x16, #16]\n"
+ "ld1rqw { z4.s }, p2/Z, [x15]\n"
+ "ld1rqw { z5.s }, p2/Z, [x15, #16]\n"
+ "ld1rqw { z6.s }, p2/Z, [x14]\n"
+ "mov z21.d, z24.d\n"
+ "mov z25.d, z24.d\n"
+ "ld1rqw { z7.s }, p2/Z, [x14, #16]\n"
+ "ld1rqw { z8.s }, p2/Z, [x13]\n"
+ "mov z27.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "ld1rqw { z9.s }, p2/Z, [x13, #16]\n"
+ "ld1rqw { z10.s }, p2/Z, [x11]\n"
"mov z28.d, z24.d\n"
+ "mov z16.d, z24.d\n"
+ "ld1rqw { z11.s }, p2/Z, [x11, #16]\n"
+ "ld1rqw { z12.s }, p2/Z, [x10]\n"
+ "mov z22.d, z24.d\n"
"mov z20.d, z24.d\n"
- "ldr x20, [%x[outptrs], #0x40]\n"
- "ld1rqw { z2.s }, p1/Z, [x16]\n"
- "mov z23.d, z24.d\n"
- "mov z19.d, z24.d\n"
- "ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
- "ld1rqw { z4.s }, p1/Z, [x15]\n"
- "ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
- "ld1rqw { z6.s }, p1/Z, [x14]\n"
- "ld1rqw { z7.s }, p1/Z, [x14, #16]\n"
- "ld1rqw { z8.s }, p1/Z, [x13]\n"
- "ld1rqw { z9.s }, p1/Z, [x13, #16]\n"
- "ld1rqw { z10.s }, p1/Z, [x11]\n"
- "ld1rqw { z11.s }, p1/Z, [x11, #16]\n"
- "ld1rqw { z12.s }, p1/Z, [x10]\n"
- "ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
- "ld1rqw { z14.s }, p1/Z, [x9]\n"
- "ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
- "ld1rw { z22.s }, p1/Z, [%x[clamps]]\n"
- "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1rqw { z13.s }, p2/Z, [x10, #16]\n"
+ "ld1rqw { z14.s }, p2/Z, [x9]\n"
+ "ld1rqw { z15.s }, p2/Z, [x9, #16]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[clamps]]\n"
+ "ld1rw { z19.s }, p2/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
"1:" // Output channel complete vector loop
"fmla z24.s, z31.s, z2.s[0]\n"
@@ -95,37 +95,37 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"fmla z26.s, z31.s, z6.s[2]\n"
"fmla z28.s, z31.s, z7.s[0]\n"
"mov z0.d, z11.d\n"
- "mov p0.b, p2.b\n"
+ "mov p0.b, p1.b\n"
"fmla z21.s, z31.s, z2.s[2]\n"
"fmla z25.s, z31.s, z3.s[0]\n"
- "whilelt p2.s, x17, %x[channel_multiplier]\n"
- "fmla z20.s, z31.s, z1.s[0]\n"
- "fmla z23.s, z31.s, z1.s[2]\n"
- "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z16.s, z31.s, z1.s[0]\n"
+ "fmla z22.s, z31.s, z1.s[2]\n"
+ "whilelt p1.s, x17, %x[channel_multiplier]\n"
+ "fmla z20.s, z31.s, z0.s[0]\n"
"fmla z24.s, z30.s, z2.s[1]\n"
- "ld1w { z18.s }, p1/Z, [%x[params]]\n"
+ "ld1w { z18.s }, p2/Z, [%x[params]]\n"
"fmla z27.s, z30.s, z6.s[1]\n"
"fmla z26.s, z30.s, z6.s[3]\n"
"fmla z28.s, z30.s, z7.s[1]\n"
"fmla z21.s, z30.s, z2.s[3]\n"
"fmla z25.s, z30.s, z3.s[1]\n"
- "fmla z20.s, z30.s, z1.s[1]\n"
- "fmla z23.s, z30.s, z1.s[3]\n"
- "fmla z19.s, z30.s, z0.s[1]\n"
- "ld1w { z17.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z16.s, z30.s, z1.s[1]\n"
+ "fmla z22.s, z30.s, z1.s[3]\n"
+ "fmla z20.s, z30.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"fmla z24.s, z29.s, z2.s[2]\n"
"fmla z27.s, z29.s, z6.s[2]\n"
"fmla z26.s, z29.s, z7.s[0]\n"
"fmla z28.s, z29.s, z7.s[2]\n"
"fmla z21.s, z29.s, z3.s[0]\n"
"fmla z25.s, z29.s, z3.s[2]\n"
- "fmla z20.s, z29.s, z1.s[2]\n"
- "fmla z23.s, z29.s, z0.s[0]\n"
+ "fmla z16.s, z29.s, z1.s[2]\n"
+ "fmla z22.s, z29.s, z0.s[0]\n"
"mov z1.d, z8.d\n"
- "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmla z20.s, z29.s, z0.s[2]\n"
"mov z0.d, z9.d\n"
"fmla z24.s, z18.s, z4.s[0]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"fmla z27.s, z18.s, z1.s[0]\n"
"fmla z26.s, z18.s, z1.s[2]\n"
"mov z1.d, z12.d\n"
@@ -133,40 +133,40 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"mov z0.d, z13.d\n"
"fmla z21.s, z18.s, z4.s[2]\n"
"fmla z25.s, z18.s, z5.s[0]\n"
- "fmla z20.s, z18.s, z1.s[0]\n"
- "fmla z23.s, z18.s, z1.s[2]\n"
- "fmla z19.s, z18.s, z0.s[0]\n"
+ "fmla z16.s, z18.s, z1.s[0]\n"
+ "fmla z22.s, z18.s, z1.s[2]\n"
+ "fmla z20.s, z18.s, z0.s[0]\n"
"mov z1.d, z8.d\n"
- "ld1w { z18.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[params], #3, MUL VL]\n"
"mov z0.d, z9.d\n"
"fmla z24.s, z17.s, z4.s[1]\n"
"fmla z27.s, z17.s, z1.s[1]\n"
"fmla z26.s, z17.s, z1.s[3]\n"
- "fmla z28.s, z17.s, z0.s[1]\n"
"mov z1.d, z12.d\n"
- "mov z0.d, z13.d\n"
"fmla z21.s, z17.s, z4.s[3]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z13.d\n"
"fmla z25.s, z17.s, z5.s[1]\n"
- "fmla z20.s, z17.s, z1.s[1]\n"
- "fmla z23.s, z17.s, z1.s[3]\n"
+ "fmla z16.s, z17.s, z1.s[1]\n"
+ "fmla z22.s, z17.s, z1.s[3]\n"
"mov z1.d, z8.d\n"
- "fmla z19.s, z17.s, z0.s[1]\n"
+ "fmla z20.s, z17.s, z0.s[1]\n"
"mov z0.d, z9.d\n"
"fmla z24.s, z31.s, z4.s[2]\n"
- "ld1w { z17.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z17.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"fmla z27.s, z31.s, z1.s[2]\n"
- "fmla z26.s, z31.s, z0.s[0]\n"
"mov z1.d, z12.d\n"
+ "fmla z21.s, z31.s, z5.s[0]\n"
+ "fmla z26.s, z31.s, z0.s[0]\n"
"fmla z28.s, z31.s, z0.s[2]\n"
"mov z0.d, z13.d\n"
- "fmla z21.s, z31.s, z5.s[0]\n"
"fmla z25.s, z31.s, z5.s[2]\n"
- "fmla z20.s, z31.s, z1.s[2]\n"
+ "fmla z16.s, z31.s, z1.s[2]\n"
"mov z1.d, z10.d\n"
- "fmla z23.s, z31.s, z0.s[0]\n"
- "fmla z19.s, z31.s, z0.s[2]\n"
+ "fmla z22.s, z31.s, z0.s[0]\n"
+ "fmla z20.s, z31.s, z0.s[2]\n"
"mov z0.d, z11.d\n"
- "ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #5, MUL VL]\n"
"fmla z24.s, z18.s, z6.s[0]\n"
"fmla z27.s, z18.s, z1.s[0]\n"
"fmla z26.s, z18.s, z1.s[2]\n"
@@ -175,13 +175,13 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"mov z0.d, z15.d\n"
"fmla z21.s, z18.s, z6.s[2]\n"
"fmla z25.s, z18.s, z7.s[0]\n"
- "fmla z20.s, z18.s, z1.s[0]\n"
- "fmla z23.s, z18.s, z1.s[2]\n"
+ "fmla z16.s, z18.s, z1.s[0]\n"
+ "fmla z22.s, z18.s, z1.s[2]\n"
"mov z1.d, z10.d\n"
- "fmla z19.s, z18.s, z0.s[0]\n"
+ "fmla z20.s, z18.s, z0.s[0]\n"
"mov z0.d, z11.d\n"
"fmla z24.s, z17.s, z6.s[1]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #7, MUL VL]\n"
"fmla z27.s, z17.s, z1.s[1]\n"
"fmla z26.s, z17.s, z1.s[3]\n"
"mov z1.d, z14.d\n"
@@ -189,63 +189,63 @@ void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_imp
"mov z0.d, z15.d\n"
"fmla z21.s, z17.s, z6.s[3]\n"
"fmla z25.s, z17.s, z7.s[1]\n"
- "fmla z20.s, z17.s, z1.s[1]\n"
- "fmla z23.s, z17.s, z1.s[3]\n"
- "fmla z19.s, z17.s, z0.s[1]\n"
+ "fmla z16.s, z17.s, z1.s[1]\n"
+ "fmla z22.s, z17.s, z1.s[3]\n"
+ "fmla z20.s, z17.s, z0.s[1]\n"
"mov z1.d, z10.d\n"
"mov z0.d, z11.d\n"
"fmla z24.s, z29.s, z6.s[2]\n"
"fmla z27.s, z29.s, z1.s[2]\n"
- "fmin z24.s, p1/M, z24.s, z16.s\n"
+ "mov z1.d, z14.d\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
"fmla z26.s, z29.s, z0.s[0]\n"
"fmla z28.s, z29.s, z0.s[2]\n"
- "mov z1.d, z14.d\n"
- "fmax z24.s, p1/M, z24.s, z22.s\n"
"mov z0.d, z15.d\n"
- "fmla z21.s, z29.s, z7.s[0]\n"
"fmla z25.s, z29.s, z7.s[2]\n"
- "fmin z21.s, p1/M, z21.s, z16.s\n"
- "fmla z20.s, z29.s, z1.s[2]\n"
- "fmla z23.s, z29.s, z0.s[0]\n"
- "fmin z25.s, p1/M, z25.s, z16.s\n"
- "fmin z27.s, p1/M, z27.s, z16.s\n"
- "fmla z19.s, z29.s, z0.s[2]\n"
- "fmin z26.s, p1/M, z26.s, z16.s\n"
- "fmin z28.s, p1/M, z28.s, z16.s\n"
+ "fmla z16.s, z29.s, z1.s[2]\n"
+ "fmla z22.s, z29.s, z0.s[0]\n"
+ "fmla z20.s, z29.s, z0.s[2]\n"
+ "fmin z24.s, p2/M, z24.s, z19.s\n"
+ "fmin z27.s, p2/M, z27.s, z19.s\n"
+ "fmin z21.s, p2/M, z21.s, z19.s\n"
+ "fmin z26.s, p2/M, z26.s, z19.s\n"
+ "fmin z25.s, p2/M, z25.s, z19.s\n"
+ "fmin z28.s, p2/M, z28.s, z19.s\n"
+ "fmax z24.s, p2/M, z24.s, z23.s\n"
+ "fmin z16.s, p2/M, z16.s, z19.s\n"
+ "fmin z22.s, p2/M, z22.s, z19.s\n"
+ "fmin z20.s, p2/M, z20.s, z19.s\n"
+ "fmax z21.s, p2/M, z21.s, z23.s\n"
+ "fmax z25.s, p2/M, z25.s, z23.s\n"
"st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
- "fmin z20.s, p1/M, z20.s, z16.s\n"
- "fmin z23.s, p1/M, z23.s, z16.s\n"
- "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "fmin z19.s, p1/M, z19.s, z16.s\n"
+ "ld1w { z24.s }, p1/Z, [%x[params], #6, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "fmax z21.s, p1/M, z21.s, z22.s\n"
- "fmax z25.s, p1/M, z25.s, z22.s\n"
+ "fmax z27.s, p2/M, z27.s, z23.s\n"
+ "fmax z26.s, p2/M, z26.s, z23.s\n"
+ "fmax z28.s, p2/M, z28.s, z23.s\n"
+ "fmax z16.s, p2/M, z16.s, z23.s\n"
+ "fmax z22.s, p2/M, z22.s, z23.s\n"
"st1w { z21.s }, p0, [x27, x12, LSL #2]\n"
"mov z21.d, z24.d\n"
- "fmax z27.s, p1/M, z27.s, z22.s\n"
- "fmax z26.s, p1/M, z26.s, z22.s\n"
+ "fmax z20.s, p2/M, z20.s, z23.s\n"
"st1w { z25.s }, p0, [x26, x12, LSL #2]\n"
"mov z25.d, z24.d\n"
- "fmax z28.s, p1/M, z28.s, z22.s\n"
- "fmax z20.s, p1/M, z20.s, z22.s\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
"st1w { z27.s }, p0, [x25, x12, LSL #2]\n"
"mov z27.d, z24.d\n"
- "fmax z23.s, p1/M, z23.s, z22.s\n"
- "fmax z19.s, p1/M, z19.s, z22.s\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
"st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
"mov z26.d, z24.d\n"
"st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
"mov z28.d, z24.d\n"
- "addvl %x[params], %x[params], #-6\n"
- "st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
- "mov z20.d, z24.d\n"
- "st1w { z23.s }, p0, [x21, x12, LSL #2]\n"
- "mov z23.d, z24.d\n"
- "st1w { z19.s }, p0, [x20, x12, LSL #2]\n"
+ "st1w { z16.s }, p0, [x22, x12, LSL #2]\n"
+ "mov z16.d, z24.d\n"
+ "st1w { z22.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z22.d, z24.d\n"
+ "st1w { z20.s }, p0, [x20, x12, LSL #2]\n"
"incw x12\n"
- "mov z19.d, z24.d\n"
+ "mov z20.d, z24.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
index 84ab4b5035..292fd70fba 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,46 +43,46 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
__asm__ __volatile__(
"mov x15, #0x0\n"
- "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "ptrue p2.b\n"
"ldr x14, [%x[inptrs], #0x0]\n"
"ldr x13, [%x[inptrs], #0x8]\n"
- "ptrue p1.b\n"
"ldr x12, [%x[inptrs], #0x10]\n"
"ldr x11, [%x[inptrs], #0x18]\n"
"mov x10, #0x0\n"
"ldr x9, [%x[inptrs], #0x20]\n"
"ldr x28, [%x[inptrs], #0x28]\n"
- "ld1w { z16.s }, p2/Z, [%x[params]]\n"
+ "whilelt p1.s, x15, %x[channel_multiplier]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "mov z25.d, z16.d\n"
- "mov z15.d, z16.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1w { z16.s }, p1/Z, [%x[params]]\n"
+ "ld1rqw { z2.s }, p2/Z, [x14]\n"
+ "ld1rqw { z3.s }, p2/Z, [x14, #16]\n"
+ "ld1rqw { z4.s }, p2/Z, [x13]\n"
+ "ld1rqw { z5.s }, p2/Z, [x13, #16]\n"
+ "ld1rqw { z6.s }, p2/Z, [x12]\n"
+ "ld1rqw { z7.s }, p2/Z, [x12, #16]\n"
+ "ld1rqw { z8.s }, p2/Z, [x11]\n"
+ "mov z25.d, z16.d\n"
+ "mov z15.d, z16.d\n"
+ "ld1rqw { z9.s }, p2/Z, [x11, #16]\n"
+ "ld1rqw { z10.s }, p2/Z, [x9]\n"
"mov z24.d, z16.d\n"
"mov z14.d, z16.d\n"
- "ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rqw { z2.s }, p1/Z, [x14]\n"
+ "ld1rqw { z11.s }, p2/Z, [x9, #16]\n"
+ "ld1rqw { z12.s }, p2/Z, [x28]\n"
"mov z26.d, z16.d\n"
"mov z17.d, z16.d\n"
- "ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
- "ld1rqw { z4.s }, p1/Z, [x13]\n"
- "mov z23.d, z16.d\n"
- "ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
- "ld1rqw { z6.s }, p1/Z, [x12]\n"
- "ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
- "ld1rqw { z8.s }, p1/Z, [x11]\n"
- "ld1rqw { z9.s }, p1/Z, [x11, #16]\n"
- "ld1rqw { z10.s }, p1/Z, [x9]\n"
- "ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
- "ld1rqw { z12.s }, p1/Z, [x28]\n"
- "ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
- "ld1rw { z21.s }, p1/Z, [%x[clamps]]\n"
- "ld1rw { z22.s }, p1/Z, [%x[clamps], #4]\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1rqw { z13.s }, p2/Z, [x28, #16]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[clamps]]\n"
+ "mov z21.d, z16.d\n"
+ "ld1rw { z22.s }, p2/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #6\n"
"1:" // Output channel complete vector loop
"fmla z16.s, z31.s, z2.s[0]\n"
@@ -92,13 +92,13 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z15.s, z31.s, z2.s[2]\n"
"fmla z24.s, z31.s, z2.s[3]\n"
"mov z1.d, z9.d\n"
- "mov p0.b, p2.b\n"
+ "mov p0.b, p1.b\n"
"fmla z14.s, z31.s, z4.s[0]\n"
"fmla z26.s, z31.s, z4.s[1]\n"
- "whilelt p2.s, x15, %x[channel_multiplier]\n"
"fmla z17.s, z31.s, z4.s[2]\n"
- "fmla z23.s, z31.s, z4.s[3]\n"
- "ld1w { z20.s }, p1/Z, [%x[params]]\n"
+ "fmla z21.s, z31.s, z4.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params]]\n"
+ "whilelt p1.s, x15, %x[channel_multiplier]\n"
"fmla z16.s, z30.s, z2.s[1]\n"
"fmla z25.s, z30.s, z2.s[2]\n"
"fmla z15.s, z30.s, z2.s[3]\n"
@@ -106,8 +106,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z30.s, z4.s[1]\n"
"fmla z26.s, z30.s, z4.s[2]\n"
"fmla z17.s, z30.s, z4.s[3]\n"
- "fmla z23.s, z30.s, z5.s[0]\n"
- "ld1w { z19.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z21.s, z30.s, z5.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"fmla z16.s, z29.s, z2.s[2]\n"
"fmla z25.s, z29.s, z2.s[3]\n"
"fmla z15.s, z29.s, z3.s[0]\n"
@@ -115,8 +115,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z29.s, z4.s[2]\n"
"fmla z26.s, z29.s, z4.s[3]\n"
"fmla z17.s, z29.s, z5.s[0]\n"
- "fmla z23.s, z29.s, z5.s[1]\n"
- "ld1w { z18.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z21.s, z29.s, z5.s[1]\n"
+ "ld1w { z18.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"fmla z16.s, z28.s, z2.s[3]\n"
"fmla z25.s, z28.s, z3.s[0]\n"
"fmla z15.s, z28.s, z3.s[1]\n"
@@ -124,8 +124,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z28.s, z4.s[3]\n"
"fmla z26.s, z28.s, z5.s[0]\n"
"fmla z17.s, z28.s, z5.s[1]\n"
- "fmla z23.s, z28.s, z5.s[2]\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z21.s, z28.s, z5.s[2]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #3, MUL VL]\n"
"fmla z16.s, z27.s, z3.s[0]\n"
"fmla z25.s, z27.s, z3.s[1]\n"
"fmla z15.s, z27.s, z3.s[2]\n"
@@ -133,8 +133,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z27.s, z5.s[0]\n"
"fmla z26.s, z27.s, z5.s[1]\n"
"fmla z17.s, z27.s, z5.s[2]\n"
- "fmla z23.s, z27.s, z5.s[3]\n"
- "ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z21.s, z27.s, z5.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"fmla z16.s, z20.s, z4.s[0]\n"
"fmla z25.s, z20.s, z4.s[1]\n"
"fmla z15.s, z20.s, z4.s[2]\n"
@@ -142,8 +142,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z20.s, z6.s[0]\n"
"fmla z26.s, z20.s, z6.s[1]\n"
"fmla z17.s, z20.s, z6.s[2]\n"
- "fmla z23.s, z20.s, z6.s[3]\n"
- "ld1w { z20.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z21.s, z20.s, z6.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #5, MUL VL]\n"
"fmla z16.s, z19.s, z4.s[1]\n"
"fmla z25.s, z19.s, z4.s[2]\n"
"fmla z15.s, z19.s, z4.s[3]\n"
@@ -151,8 +151,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z19.s, z6.s[1]\n"
"fmla z26.s, z19.s, z6.s[2]\n"
"fmla z17.s, z19.s, z6.s[3]\n"
- "fmla z23.s, z19.s, z7.s[0]\n"
- "ld1w { z19.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z21.s, z19.s, z7.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
"fmla z16.s, z18.s, z4.s[2]\n"
"fmla z25.s, z18.s, z4.s[3]\n"
"fmla z15.s, z18.s, z5.s[0]\n"
@@ -160,18 +160,18 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z18.s, z6.s[2]\n"
"fmla z26.s, z18.s, z6.s[3]\n"
"fmla z17.s, z18.s, z7.s[0]\n"
- "fmla z23.s, z18.s, z7.s[1]\n"
- "ld1w { z18.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z21.s, z18.s, z7.s[1]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmla z16.s, z28.s, z4.s[3]\n"
- "fmla z25.s, z28.s, z5.s[0]\n"
- "fmla z15.s, z28.s, z5.s[1]\n"
- "fmla z24.s, z28.s, z5.s[2]\n"
- "fmla z14.s, z28.s, z6.s[3]\n"
- "fmla z26.s, z28.s, z7.s[0]\n"
- "fmla z17.s, z28.s, z7.s[1]\n"
- "fmla z23.s, z28.s, z7.s[2]\n"
- "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z16.s, z30.s, z4.s[3]\n"
+ "fmla z25.s, z30.s, z5.s[0]\n"
+ "fmla z15.s, z30.s, z5.s[1]\n"
+ "fmla z24.s, z30.s, z5.s[2]\n"
+ "fmla z14.s, z30.s, z6.s[3]\n"
+ "fmla z26.s, z30.s, z7.s[0]\n"
+ "fmla z17.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z7.s[2]\n"
+ "ld1w { z18.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
"fmla z16.s, z27.s, z5.s[0]\n"
"fmla z25.s, z27.s, z5.s[1]\n"
"fmla z15.s, z27.s, z5.s[2]\n"
@@ -179,8 +179,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z27.s, z7.s[0]\n"
"fmla z26.s, z27.s, z7.s[1]\n"
"fmla z17.s, z27.s, z7.s[2]\n"
- "fmla z23.s, z27.s, z7.s[3]\n"
- "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "fmla z21.s, z27.s, z7.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
"fmla z16.s, z20.s, z6.s[0]\n"
"fmla z25.s, z20.s, z6.s[1]\n"
"fmla z15.s, z20.s, z6.s[2]\n"
@@ -188,8 +188,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z20.s, z0.s[0]\n"
"fmla z26.s, z20.s, z0.s[1]\n"
"fmla z17.s, z20.s, z0.s[2]\n"
- "fmla z23.s, z20.s, z0.s[3]\n"
- "ld1w { z20.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z21.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #-6, MUL VL]\n"
"fmla z16.s, z19.s, z6.s[1]\n"
"fmla z25.s, z19.s, z6.s[2]\n"
"fmla z15.s, z19.s, z6.s[3]\n"
@@ -197,26 +197,26 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z19.s, z0.s[1]\n"
"fmla z26.s, z19.s, z0.s[2]\n"
"fmla z17.s, z19.s, z0.s[3]\n"
- "fmla z23.s, z19.s, z1.s[0]\n"
- "ld1w { z19.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
- "fmla z16.s, z18.s, z6.s[2]\n"
- "fmla z25.s, z18.s, z6.s[3]\n"
- "fmla z15.s, z18.s, z7.s[0]\n"
- "fmla z24.s, z18.s, z7.s[1]\n"
- "fmla z14.s, z18.s, z0.s[2]\n"
- "fmla z26.s, z18.s, z0.s[3]\n"
- "fmla z17.s, z18.s, z1.s[0]\n"
- "fmla z23.s, z18.s, z1.s[1]\n"
- "ld1w { z18.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
- "fmla z16.s, z30.s, z6.s[3]\n"
- "fmla z25.s, z30.s, z7.s[0]\n"
- "fmla z15.s, z30.s, z7.s[1]\n"
- "fmla z24.s, z30.s, z7.s[2]\n"
- "fmla z14.s, z30.s, z0.s[3]\n"
- "fmla z26.s, z30.s, z1.s[0]\n"
- "fmla z17.s, z30.s, z1.s[1]\n"
- "fmla z23.s, z30.s, z1.s[2]\n"
- "ld1w { z31.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z21.s, z19.s, z1.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z16.s, z29.s, z6.s[2]\n"
+ "fmla z25.s, z29.s, z6.s[3]\n"
+ "fmla z15.s, z29.s, z7.s[0]\n"
+ "fmla z24.s, z29.s, z7.s[1]\n"
+ "fmla z14.s, z29.s, z0.s[2]\n"
+ "fmla z26.s, z29.s, z0.s[3]\n"
+ "fmla z17.s, z29.s, z1.s[0]\n"
+ "fmla z21.s, z29.s, z1.s[1]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z16.s, z18.s, z6.s[3]\n"
+ "fmla z25.s, z18.s, z7.s[0]\n"
+ "fmla z15.s, z18.s, z7.s[1]\n"
+ "fmla z24.s, z18.s, z7.s[2]\n"
+ "fmla z14.s, z18.s, z0.s[3]\n"
+ "fmla z26.s, z18.s, z1.s[0]\n"
+ "fmla z17.s, z18.s, z1.s[1]\n"
+ "fmla z21.s, z18.s, z1.s[2]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
"fmla z16.s, z27.s, z7.s[0]\n"
"fmla z25.s, z27.s, z7.s[1]\n"
"fmla z15.s, z27.s, z7.s[2]\n"
@@ -224,8 +224,8 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z27.s, z1.s[0]\n"
"fmla z26.s, z27.s, z1.s[1]\n"
"fmla z17.s, z27.s, z1.s[2]\n"
- "fmla z23.s, z27.s, z1.s[3]\n"
- "ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "fmla z21.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
"fmla z16.s, z20.s, z0.s[0]\n"
"fmla z25.s, z20.s, z0.s[1]\n"
"fmla z15.s, z20.s, z0.s[2]\n"
@@ -234,47 +234,47 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z20.s, z0.s[0]\n"
"fmla z26.s, z20.s, z0.s[1]\n"
"fmla z17.s, z20.s, z0.s[2]\n"
- "fmla z23.s, z20.s, z0.s[3]\n"
+ "fmla z21.s, z20.s, z0.s[3]\n"
"mov z0.d, z8.d\n"
- "ld1w { z20.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #-1, MUL VL]\n"
"fmla z16.s, z19.s, z0.s[1]\n"
"fmla z25.s, z19.s, z0.s[2]\n"
"fmla z15.s, z19.s, z0.s[3]\n"
"fmla z24.s, z19.s, z1.s[0]\n"
- "mov z1.d, z10.d\n"
- "mov z0.d, z11.d\n"
- "fmla z14.s, z19.s, z1.s[1]\n"
- "fmla z26.s, z19.s, z1.s[2]\n"
- "fmla z17.s, z19.s, z1.s[3]\n"
- "fmla z23.s, z19.s, z0.s[0]\n"
- "mov z1.d, z8.d\n"
- "ld1w { z19.s }, p1/Z, [%x[params]]\n"
- "mov z0.d, z9.d\n"
- "fmla z16.s, z18.s, z1.s[2]\n"
- "fmla z25.s, z18.s, z1.s[3]\n"
- "fmla z15.s, z18.s, z0.s[0]\n"
- "fmla z24.s, z18.s, z0.s[1]\n"
- "mov z1.d, z10.d\n"
- "mov z0.d, z11.d\n"
- "fmla z14.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z1.s[3]\n"
- "fmla z17.s, z18.s, z0.s[0]\n"
- "fmla z23.s, z18.s, z0.s[1]\n"
- "mov z1.d, z8.d\n"
- "ld1w { z18.s }, p1/Z, [%x[params], #1, MUL VL]\n"
- "mov z0.d, z9.d\n"
- "fmla z16.s, z31.s, z1.s[3]\n"
- "fmla z25.s, z31.s, z0.s[0]\n"
- "fmla z15.s, z31.s, z0.s[1]\n"
- "fmla z24.s, z31.s, z0.s[2]\n"
"mov z0.d, z10.d\n"
"mov z1.d, z11.d\n"
- "fmla z14.s, z31.s, z0.s[3]\n"
- "fmla z26.s, z31.s, z1.s[0]\n"
- "fmla z17.s, z31.s, z1.s[1]\n"
- "fmla z23.s, z31.s, z1.s[2]\n"
+ "fmla z14.s, z19.s, z0.s[1]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "fmla z17.s, z19.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z21.s, z19.s, z1.s[0]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params]]\n"
+ "mov z1.d, z9.d\n"
+ "fmla z16.s, z30.s, z0.s[2]\n"
+ "fmla z25.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z15.s, z30.s, z1.s[0]\n"
+ "fmla z24.s, z30.s, z1.s[1]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z30.s, z0.s[2]\n"
+ "fmla z26.s, z30.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "fmla z17.s, z30.s, z1.s[0]\n"
+ "fmla z21.s, z30.s, z1.s[1]\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"mov z1.d, z9.d\n"
- "ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z28.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z25.s, z28.s, z1.s[0]\n"
+ "fmla z15.s, z28.s, z1.s[1]\n"
+ "fmla z24.s, z28.s, z1.s[2]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z28.s, z0.s[3]\n"
+ "fmla z26.s, z28.s, z1.s[0]\n"
+ "fmla z17.s, z28.s, z1.s[1]\n"
+ "fmla z21.s, z28.s, z1.s[2]\n"
+ "mov z1.d, z9.d\n"
+ "ld1w { z18.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"fmla z16.s, z27.s, z1.s[0]\n"
"fmla z25.s, z27.s, z1.s[1]\n"
"fmla z15.s, z27.s, z1.s[2]\n"
@@ -283,102 +283,102 @@ void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_imp
"fmla z14.s, z27.s, z1.s[0]\n"
"fmla z26.s, z27.s, z1.s[1]\n"
"fmla z17.s, z27.s, z1.s[2]\n"
- "fmla z23.s, z27.s, z1.s[3]\n"
- "ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
- "fmla z16.s, z20.s, z0.s[0]\n"
- "fmla z25.s, z20.s, z0.s[1]\n"
- "fmla z15.s, z20.s, z0.s[2]\n"
- "fmla z24.s, z20.s, z0.s[3]\n"
+ "fmla z21.s, z27.s, z1.s[3]\n"
+ "fmla z16.s, z31.s, z0.s[0]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z25.s, z31.s, z0.s[1]\n"
+ "fmla z15.s, z31.s, z0.s[2]\n"
+ "fmla z24.s, z31.s, z0.s[3]\n"
"mov z0.d, z12.d\n"
- "fmla z14.s, z20.s, z0.s[0]\n"
- "fmla z26.s, z20.s, z0.s[1]\n"
- "fmla z17.s, z20.s, z0.s[2]\n"
- "fmla z23.s, z20.s, z0.s[3]\n"
+ "fmla z14.s, z31.s, z0.s[0]\n"
+ "fmla z26.s, z31.s, z0.s[1]\n"
+ "fmla z17.s, z31.s, z0.s[2]\n"
+ "fmla z21.s, z31.s, z0.s[3]\n"
"mov z0.d, z10.d\n"
- "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
- "fmla z16.s, z19.s, z0.s[1]\n"
- "fmla z25.s, z19.s, z0.s[2]\n"
- "fmla z15.s, z19.s, z0.s[3]\n"
- "fmla z24.s, z19.s, z1.s[0]\n"
- "mov z1.d, z12.d\n"
- "mov z0.d, z13.d\n"
- "fmla z14.s, z19.s, z1.s[1]\n"
- "fmla z26.s, z19.s, z1.s[2]\n"
- "fmla z17.s, z19.s, z1.s[3]\n"
- "fmla z23.s, z19.s, z0.s[0]\n"
- "mov z1.d, z10.d\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "mov z0.d, z11.d\n"
- "fmla z16.s, z18.s, z1.s[2]\n"
- "fmla z25.s, z18.s, z1.s[3]\n"
- "fmla z15.s, z18.s, z0.s[0]\n"
- "fmla z24.s, z18.s, z0.s[1]\n"
- "mov z1.d, z12.d\n"
- "mov z0.d, z13.d\n"
- "fmla z14.s, z18.s, z1.s[2]\n"
- "fmla z26.s, z18.s, z1.s[3]\n"
- "fmla z17.s, z18.s, z0.s[0]\n"
- "fmla z23.s, z18.s, z0.s[1]\n"
- "mov z1.d, z10.d\n"
- "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "mov z0.d, z11.d\n"
- "fmla z16.s, z28.s, z1.s[3]\n"
- "fmla z25.s, z28.s, z0.s[0]\n"
- "fmla z15.s, z28.s, z0.s[1]\n"
- "fmla z24.s, z28.s, z0.s[2]\n"
- "mov z0.d, z13.d\n"
- "mov z1.d, z12.d\n"
- "fmla z26.s, z28.s, z0.s[0]\n"
- "fmla z17.s, z28.s, z0.s[1]\n"
- "fmla z23.s, z28.s, z0.s[2]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[1]\n"
+ "fmla z25.s, z20.s, z0.s[2]\n"
+ "fmla z15.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z24.s, z20.s, z1.s[0]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z14.s, z20.s, z0.s[1]\n"
+ "fmla z26.s, z20.s, z0.s[2]\n"
+ "fmla z17.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z21.s, z20.s, z1.s[0]\n"
+ "mov z1.d, z11.d\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[2]\n"
+ "fmla z25.s, z19.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z15.s, z19.s, z1.s[0]\n"
+ "fmla z24.s, z19.s, z1.s[1]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z14.s, z19.s, z0.s[2]\n"
+ "fmla z26.s, z19.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z17.s, z19.s, z1.s[0]\n"
+ "fmla z21.s, z19.s, z1.s[1]\n"
+ "mov z1.d, z11.d\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z16.s, z18.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z25.s, z18.s, z1.s[0]\n"
+ "fmla z15.s, z18.s, z1.s[1]\n"
+ "fmla z24.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z13.d\n"
+ "fmla z14.s, z18.s, z0.s[3]\n"
+ "fmla z26.s, z18.s, z1.s[0]\n"
+ "fmla z17.s, z18.s, z1.s[1]\n"
+ "fmla z21.s, z18.s, z1.s[2]\n"
"mov z0.d, z11.d\n"
- "fmla z14.s, z28.s, z1.s[3]\n"
- "fmla z16.s, z27.s, z0.s[0]\n"
- "fmla z25.s, z27.s, z0.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z22.s\n"
- "fmax z16.s, p1/M, z16.s, z21.s\n"
- "fmla z15.s, z27.s, z0.s[2]\n"
- "fmla z24.s, z27.s, z0.s[3]\n"
+ "fmla z16.s, z28.s, z0.s[0]\n"
+ "fmla z25.s, z28.s, z0.s[1]\n"
+ "fmla z15.s, z28.s, z0.s[2]\n"
+ "fmla z24.s, z28.s, z0.s[3]\n"
"mov z0.d, z13.d\n"
- "fmin z25.s, p1/M, z25.s, z22.s\n"
- "fmla z14.s, z27.s, z0.s[0]\n"
- "fmla z26.s, z27.s, z0.s[1]\n"
- "fmin z15.s, p1/M, z15.s, z22.s\n"
- "fmin z24.s, p1/M, z24.s, z22.s\n"
- "fmla z17.s, z27.s, z0.s[2]\n"
- "fmla z23.s, z27.s, z0.s[3]\n"
- "fmin z14.s, p1/M, z14.s, z22.s\n"
- "fmin z26.s, p1/M, z26.s, z22.s\n"
- "fmin z17.s, p1/M, z17.s, z22.s\n"
- "fmin z23.s, p1/M, z23.s, z22.s\n"
+ "fmla z14.s, z28.s, z0.s[0]\n"
+ "fmla z26.s, z28.s, z0.s[1]\n"
+ "fmla z17.s, z28.s, z0.s[2]\n"
+ "fmla z21.s, z28.s, z0.s[3]\n"
+ "fmin z16.s, p2/M, z16.s, z22.s\n"
+ "fmin z25.s, p2/M, z25.s, z22.s\n"
+ "fmin z15.s, p2/M, z15.s, z22.s\n"
+ "fmin z24.s, p2/M, z24.s, z22.s\n"
+ "fmin z14.s, p2/M, z14.s, z22.s\n"
+ "fmax z16.s, p2/M, z16.s, z23.s\n"
+ "fmin z26.s, p2/M, z26.s, z22.s\n"
+ "fmin z17.s, p2/M, z17.s, z22.s\n"
+ "fmin z21.s, p2/M, z21.s, z22.s\n"
+ "fmax z25.s, p2/M, z25.s, z23.s\n"
"st1w { z16.s }, p0, [x27, x10, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "fmax z25.s, p1/M, z25.s, z21.s\n"
+ "fmax z15.s, p2/M, z15.s, z23.s\n"
+ "fmax z24.s, p2/M, z24.s, z23.s\n"
+ "fmax z14.s, p2/M, z14.s, z23.s\n"
+ "fmax z26.s, p2/M, z26.s, z23.s\n"
+ "fmax z17.s, p2/M, z17.s, z23.s\n"
"st1w { z25.s }, p0, [x26, x10, LSL #2]\n"
"mov z25.d, z16.d\n"
- "fmax z15.s, p1/M, z15.s, z21.s\n"
- "fmax z24.s, p1/M, z24.s, z21.s\n"
+ "fmax z21.s, p2/M, z21.s, z23.s\n"
"st1w { z15.s }, p0, [x25, x10, LSL #2]\n"
"mov z15.d, z16.d\n"
- "fmax z14.s, p1/M, z14.s, z21.s\n"
- "fmax z26.s, p1/M, z26.s, z21.s\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
"st1w { z24.s }, p0, [x24, x10, LSL #2]\n"
"mov z24.d, z16.d\n"
- "fmax z17.s, p1/M, z17.s, z21.s\n"
- "fmax z23.s, p1/M, z23.s, z21.s\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "addvl %x[params], %x[params], #-6\n"
"st1w { z14.s }, p0, [x23, x10, LSL #2]\n"
"mov z14.d, z16.d\n"
"st1w { z26.s }, p0, [x22, x10, LSL #2]\n"
"mov z26.d, z16.d\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
"st1w { z17.s }, p0, [x21, x10, LSL #2]\n"
"mov z17.d, z16.d\n"
- "addvl %x[params], %x[params], #-6\n"
- "st1w { z23.s }, p0, [x20, x10, LSL #2]\n"
+ "st1w { z21.s }, p0, [x20, x10, LSL #2]\n"
"incw x10\n"
- "mov z23.d, z16.d\n"
+ "mov z21.d, z16.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 1770ec182c..7681f346ea 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,8 +46,8 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
__asm__ __volatile__(
"ptrue p1.b\n"
"mov x9, #0x0\n"
- "ld1rw { z15.s }, p1/Z, [%x[minmax_vals]]\n"
- "ld1rw { z14.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "ld1rw { z10.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z9.s }, p1/Z, [%x[minmax_vals], #4]\n"
"whilelt p0.s, x9, %x[n_output_channels]\n"
"1:" // Output channel loop
"mov z31.b, #0x0\n"
@@ -55,178 +55,178 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"ld1w { z31.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
"2:" // Output channel loop: Load bias: Done
"mov x23, %x[inptrs]\n"
- "ldp x21, x20, [x23], #0x10\n"
"lsr x22, %x[kernel_points], #0x1\n"
"mov z16.d, z31.d\n"
"mov z17.d, z31.d\n"
"mov z18.d, z31.d\n"
- "ld1rqw { z6.s }, p1/Z, [x21]\n"
- "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
"mov z19.d, z31.d\n"
+ "ld1w { z13.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
"mov z20.d, z31.d\n"
- "ld1rqw { z1.s }, p1/Z, [x20]\n"
- "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
"mov z21.d, z31.d\n"
+ "ldp x21, x20, [x23], #0x10\n"
"mov z22.d, z31.d\n"
- "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "addvl %x[weights], %x[weights], #1\n"
"mov z23.d, z31.d\n"
"mov z24.d, z31.d\n"
"mov z25.d, z31.d\n"
"mov z26.d, z31.d\n"
"mov z27.d, z31.d\n"
+ "ld1rqw { z7.s }, p1/Z, [x21]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21, #16]\n"
"mov z28.d, z31.d\n"
"mov z29.d, z31.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
"mov z30.d, z31.d\n"
"mov z31.d, z31.d\n"
"cbz x22, 6f\n"
"ldp x21, x20, [x23], #0x10\n"
"subs x22, x22, #0x1\n"
- "ld1rqw { z0.s }, p1/Z, [x21]\n"
- "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
- "ld1rqw { z7.s }, p1/Z, [x20]\n"
- "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
- "ld1w { z11.s }, p1/Z, [%x[weights]]\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
"addvl %x[weights], %x[weights], #1\n"
+ "ld1rqw { z5.s }, p1/Z, [x21]\n"
+ "ld1rqw { z3.s }, p1/Z, [x21, #16]\n"
+ "ld1rqw { z0.s }, p1/Z, [x20]\n"
+ "ld1rqw { z4.s }, p1/Z, [x20, #16]\n"
"beq 4f\n"
"3:" // Output channel loop: Kernel loop
"ldp x21, x20, [x23], #0x10\n"
- "fmla z16.s, z8.s, z6.s[0]\n"
- "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmla z16.s, z13.s, z7.s[0]\n"
+ "fmla z17.s, z13.s, z7.s[1]\n"
"subs x22, x22, #0x1\n"
- "fmla z18.s, z8.s, z6.s[2]\n"
- "fmla z19.s, z8.s, z6.s[3]\n"
- "ld1rqw { z6.s }, p1/Z, [x21]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z21.s, z8.s, z5.s[1]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z23.s, z8.s, z5.s[3]\n"
- "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
- "fmla z24.s, z8.s, z1.s[0]\n"
- "fmla z25.s, z8.s, z1.s[1]\n"
- "fmla z26.s, z8.s, z1.s[2]\n"
- "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z18.s, z13.s, z7.s[2]\n"
+ "fmla z19.s, z13.s, z7.s[3]\n"
+ "fmla z20.s, z13.s, z6.s[0]\n"
+ "fmla z21.s, z13.s, z6.s[1]\n"
+ "fmla z22.s, z13.s, z6.s[2]\n"
+ "fmla z23.s, z13.s, z6.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x21]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z13.s, z1.s[0]\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "fmla z26.s, z13.s, z1.s[2]\n"
+ "fmla z27.s, z13.s, z1.s[3]\n"
"ld1rqw { z1.s }, p1/Z, [x20]\n"
- "fmla z28.s, z8.s, z2.s[0]\n"
- "fmla z29.s, z8.s, z2.s[1]\n"
- "fmla z30.s, z8.s, z2.s[2]\n"
- "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z28.s, z13.s, z2.s[0]\n"
+ "fmla z29.s, z13.s, z2.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[2]\n"
+ "fmla z31.s, z13.s, z2.s[3]\n"
"ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
"ldp x21, x20, [x23], #0x10\n"
- "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
- "fmla z16.s, z11.s, z0.s[0]\n"
- "fmla z17.s, z11.s, z0.s[1]\n"
- "fmla z18.s, z11.s, z0.s[2]\n"
- "fmla z19.s, z11.s, z0.s[3]\n"
- "ld1rqw { z0.s }, p1/Z, [x21]\n"
- "fmla z20.s, z11.s, z4.s[0]\n"
- "fmla z21.s, z11.s, z4.s[1]\n"
- "fmla z22.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z4.s[3]\n"
- "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
- "fmla z24.s, z11.s, z7.s[0]\n"
- "fmla z25.s, z11.s, z7.s[1]\n"
- "fmla z26.s, z11.s, z7.s[2]\n"
- "fmla z27.s, z11.s, z7.s[3]\n"
- "ld1rqw { z7.s }, p1/Z, [x20]\n"
- "fmla z28.s, z11.s, z3.s[0]\n"
- "fmla z29.s, z11.s, z3.s[1]\n"
- "fmla z30.s, z11.s, z3.s[2]\n"
- "fmla z31.s, z11.s, z3.s[3]\n"
- "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
- "ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+ "ld1w { z13.s }, p1/Z, [%x[weights]]\n"
+ "fmla z16.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z8.s, z5.s[1]\n"
+ "fmla z18.s, z8.s, z5.s[2]\n"
+ "fmla z19.s, z8.s, z5.s[3]\n"
+ "fmla z20.s, z8.s, z3.s[0]\n"
+ "fmla z21.s, z8.s, z3.s[1]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21]\n"
+ "fmla z22.s, z8.s, z3.s[2]\n"
+ "fmla z23.s, z8.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
"addvl %x[weights], %x[weights], #2\n"
"bgt 3b\n"
"4:" // Output channel loop: Kernel loop tail
"tbnz %x[kernel_points], #0, 5f\n"
- "fmla z16.s, z8.s, z6.s[0]\n"
- "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmla z16.s, z13.s, z7.s[0]\n"
+ "fmla z17.s, z13.s, z7.s[1]\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "fmla z18.s, z8.s, z6.s[2]\n"
- "fmla z19.s, z8.s, z6.s[3]\n"
+ "fmla z18.s, z13.s, z7.s[2]\n"
+ "fmla z19.s, z13.s, z7.s[3]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmla z20.s, z13.s, z6.s[0]\n"
+ "fmla z21.s, z13.s, z6.s[1]\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z23.s, z8.s, z5.s[3]\n"
+ "fmla z22.s, z13.s, z6.s[2]\n"
+ "fmla z23.s, z13.s, z6.s[3]\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "fmla z24.s, z8.s, z1.s[0]\n"
- "fmla z25.s, z8.s, z1.s[1]\n"
- "fmla z26.s, z8.s, z1.s[2]\n"
- "fmla z27.s, z8.s, z1.s[3]\n"
- "fmla z28.s, z8.s, z2.s[0]\n"
- "fmla z29.s, z8.s, z2.s[1]\n"
- "fmla z30.s, z8.s, z2.s[2]\n"
- "fmla z31.s, z8.s, z2.s[3]\n"
- "fmla z16.s, z11.s, z0.s[0]\n"
- "fmla z17.s, z11.s, z0.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z14.s\n"
- "fmin z17.s, p1/M, z17.s, z14.s\n"
- "fmla z18.s, z11.s, z0.s[2]\n"
- "fmla z19.s, z11.s, z0.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z14.s\n"
- "fmin z19.s, p1/M, z19.s, z14.s\n"
- "fmla z20.s, z11.s, z4.s[0]\n"
- "fmla z21.s, z11.s, z4.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z14.s\n"
- "fmin z21.s, p1/M, z21.s, z14.s\n"
- "fmla z22.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z4.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z14.s\n"
- "fmin z23.s, p1/M, z23.s, z14.s\n"
- "fmla z24.s, z11.s, z7.s[0]\n"
- "fmla z25.s, z11.s, z7.s[1]\n"
- "fmax z16.s, p1/M, z16.s, z15.s\n"
- "fmax z17.s, p1/M, z17.s, z15.s\n"
- "fmla z26.s, z11.s, z7.s[2]\n"
- "fmla z27.s, z11.s, z7.s[3]\n"
- "fmax z18.s, p1/M, z18.s, z15.s\n"
- "fmax z19.s, p1/M, z19.s, z15.s\n"
- "fmla z28.s, z11.s, z3.s[0]\n"
- "fmla z29.s, z11.s, z3.s[1]\n"
- "fmax z20.s, p1/M, z20.s, z15.s\n"
- "fmax z21.s, p1/M, z21.s, z15.s\n"
- "fmla z30.s, z11.s, z3.s[2]\n"
- "fmla z31.s, z11.s, z3.s[3]\n"
- "fmax z22.s, p1/M, z22.s, z15.s\n"
- "fmax z23.s, p1/M, z23.s, z15.s\n"
- "fmin z24.s, p1/M, z24.s, z14.s\n"
- "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "fmla z24.s, z13.s, z1.s[0]\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "fmla z26.s, z13.s, z1.s[2]\n"
+ "fmla z27.s, z13.s, z1.s[3]\n"
+ "fmla z28.s, z13.s, z2.s[0]\n"
+ "fmla z29.s, z13.s, z2.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[2]\n"
+ "fmla z31.s, z13.s, z2.s[3]\n"
+ "fmla z16.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z8.s, z5.s[1]\n"
+ "fmla z18.s, z8.s, z5.s[2]\n"
+ "fmla z19.s, z8.s, z5.s[3]\n"
+ "fmla z20.s, z8.s, z3.s[0]\n"
+ "fmla z21.s, z8.s, z3.s[1]\n"
+ "fmla z22.s, z8.s, z3.s[2]\n"
+ "fmla z23.s, z8.s, z3.s[3]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z9.s\n"
+ "fmin z17.s, p1/M, z17.s, z9.s\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z9.s\n"
+ "fmin z19.s, p1/M, z19.s, z9.s\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z9.s\n"
+ "fmin z21.s, p1/M, z21.s, z9.s\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z9.s\n"
+ "fmin z23.s, p1/M, z23.s, z9.s\n"
+ "fmax z16.s, p1/M, z16.s, z10.s\n"
+ "fmax z17.s, p1/M, z17.s, z10.s\n"
+ "fmax z18.s, p1/M, z18.s, z10.s\n"
+ "fmax z19.s, p1/M, z19.s, z10.s\n"
+ "fmax z20.s, p1/M, z20.s, z10.s\n"
+ "fmax z21.s, p1/M, z21.s, z10.s\n"
+ "fmax z22.s, p1/M, z22.s, z10.s\n"
+ "fmax z23.s, p1/M, z23.s, z10.s\n"
"st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "fmin z26.s, p1/M, z26.s, z14.s\n"
- "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "fmin z24.s, p1/M, z24.s, z9.s\n"
+ "fmin z25.s, p1/M, z25.s, z9.s\n"
"st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "fmin z28.s, p1/M, z28.s, z14.s\n"
- "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "fmin z26.s, p1/M, z26.s, z9.s\n"
+ "fmin z27.s, p1/M, z27.s, z9.s\n"
"st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "fmin z30.s, p1/M, z30.s, z14.s\n"
- "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "fmin z28.s, p1/M, z28.s, z9.s\n"
+ "fmin z29.s, p1/M, z29.s, z9.s\n"
"st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z30.s, p1/M, z30.s, z9.s\n"
+ "fmin z31.s, p1/M, z31.s, z9.s\n"
"st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "fmax z24.s, p1/M, z24.s, z15.s\n"
- "fmax z25.s, p1/M, z25.s, z15.s\n"
"st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "fmax z26.s, p1/M, z26.s, z15.s\n"
- "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "fmax z24.s, p1/M, z24.s, z10.s\n"
+ "fmax z25.s, p1/M, z25.s, z10.s\n"
"st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "fmax z28.s, p1/M, z28.s, z15.s\n"
- "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "fmax z26.s, p1/M, z26.s, z10.s\n"
+ "fmax z27.s, p1/M, z27.s, z10.s\n"
"st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "fmax z30.s, p1/M, z30.s, z15.s\n"
- "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "fmax z28.s, p1/M, z28.s, z10.s\n"
+ "fmax z29.s, p1/M, z29.s, z10.s\n"
+ "fmax z30.s, p1/M, z30.s, z10.s\n"
+ "fmax z31.s, p1/M, z31.s, z10.s\n"
"st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
"st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
"st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
@@ -237,117 +237,117 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"b 7f\n"
"5:" // Output channel loop: Odd tail
- "fmla z16.s, z8.s, z6.s[0]\n"
- "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmla z16.s, z13.s, z7.s[0]\n"
+ "fmla z17.s, z13.s, z7.s[1]\n"
"ldp x20, x28, [x23], #0x10\n"
"ldr x27, [%x[outptrs], #0x0]\n"
- "fmla z18.s, z8.s, z6.s[2]\n"
- "fmla z19.s, z8.s, z6.s[3]\n"
- "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "fmla z18.s, z13.s, z7.s[2]\n"
+ "fmla z19.s, z13.s, z7.s[3]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z21.s, z8.s, z5.s[1]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla z20.s, z13.s, z6.s[0]\n"
+ "fmla z21.s, z13.s, z6.s[1]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z23.s, z8.s, z5.s[3]\n"
- "ld1rqw { z5.s }, p1/Z, [x20, #16]\n"
"ldr x23, [%x[outptrs], #0x20]\n"
- "fmla z24.s, z8.s, z1.s[0]\n"
- "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z22.s, z13.s, z6.s[2]\n"
+ "fmla z23.s, z13.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20, #16]\n"
+ "fmla z24.s, z13.s, z1.s[0]\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
"ldr x21, [%x[outptrs], #0x30]\n"
- "fmla z26.s, z8.s, z1.s[2]\n"
- "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z26.s, z13.s, z1.s[2]\n"
+ "fmla z27.s, z13.s, z1.s[3]\n"
"ld1rqw { z1.s }, p1/Z, [x28]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "fmla z28.s, z8.s, z2.s[0]\n"
- "fmla z29.s, z8.s, z2.s[1]\n"
- "fmla z30.s, z8.s, z2.s[2]\n"
- "fmla z31.s, z8.s, z2.s[3]\n"
- "ld1w { z10.s }, p1/Z, [%x[weights]]\n"
+ "fmla z28.s, z13.s, z2.s[0]\n"
+ "fmla z29.s, z13.s, z2.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[2]\n"
+ "fmla z31.s, z13.s, z2.s[3]\n"
+ "ld1w { z13.s }, p1/Z, [%x[weights]]\n"
"ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
- "fmla z16.s, z11.s, z0.s[0]\n"
- "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmla z16.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z8.s, z5.s[1]\n"
"addvl %x[weights], %x[weights], #1\n"
- "fmla z18.s, z11.s, z0.s[2]\n"
- "fmla z19.s, z11.s, z0.s[3]\n"
- "fmla z20.s, z11.s, z4.s[0]\n"
- "fmla z21.s, z11.s, z4.s[1]\n"
- "fmla z22.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z4.s[3]\n"
- "fmla z24.s, z11.s, z7.s[0]\n"
- "fmla z25.s, z11.s, z7.s[1]\n"
- "fmla z26.s, z11.s, z7.s[2]\n"
- "fmla z27.s, z11.s, z7.s[3]\n"
- "fmla z28.s, z11.s, z3.s[0]\n"
- "fmla z29.s, z11.s, z3.s[1]\n"
- "fmla z30.s, z11.s, z3.s[2]\n"
- "fmla z31.s, z11.s, z3.s[3]\n"
- "fmla z16.s, z10.s, z6.s[0]\n"
- "fmla z17.s, z10.s, z6.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z14.s\n"
- "fmin z17.s, p1/M, z17.s, z14.s\n"
- "fmla z18.s, z10.s, z6.s[2]\n"
- "fmla z19.s, z10.s, z6.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z14.s\n"
- "fmin z19.s, p1/M, z19.s, z14.s\n"
- "fmla z20.s, z10.s, z5.s[0]\n"
- "fmla z21.s, z10.s, z5.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z14.s\n"
- "fmin z21.s, p1/M, z21.s, z14.s\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z23.s, z10.s, z5.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z14.s\n"
- "fmin z23.s, p1/M, z23.s, z14.s\n"
- "fmla z24.s, z10.s, z1.s[0]\n"
- "fmla z25.s, z10.s, z1.s[1]\n"
- "fmax z16.s, p1/M, z16.s, z15.s\n"
- "fmax z17.s, p1/M, z17.s, z15.s\n"
- "fmla z26.s, z10.s, z1.s[2]\n"
- "fmla z27.s, z10.s, z1.s[3]\n"
- "fmax z18.s, p1/M, z18.s, z15.s\n"
- "fmax z19.s, p1/M, z19.s, z15.s\n"
- "fmla z28.s, z10.s, z2.s[0]\n"
- "fmla z29.s, z10.s, z2.s[1]\n"
- "fmax z20.s, p1/M, z20.s, z15.s\n"
- "fmax z21.s, p1/M, z21.s, z15.s\n"
- "fmla z30.s, z10.s, z2.s[2]\n"
- "fmla z31.s, z10.s, z2.s[3]\n"
- "fmax z22.s, p1/M, z22.s, z15.s\n"
- "fmax z23.s, p1/M, z23.s, z15.s\n"
- "fmin z24.s, p1/M, z24.s, z14.s\n"
- "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "fmla z18.s, z8.s, z5.s[2]\n"
+ "fmla z19.s, z8.s, z5.s[3]\n"
+ "fmla z20.s, z8.s, z3.s[0]\n"
+ "fmla z21.s, z8.s, z3.s[1]\n"
+ "fmla z22.s, z8.s, z3.s[2]\n"
+ "fmla z23.s, z8.s, z3.s[3]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "fmla z26.s, z8.s, z0.s[2]\n"
+ "fmla z27.s, z8.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z4.s[1]\n"
+ "fmla z30.s, z8.s, z4.s[2]\n"
+ "fmla z31.s, z8.s, z4.s[3]\n"
+ "fmla z16.s, z13.s, z6.s[0]\n"
+ "fmla z17.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z13.s, z6.s[2]\n"
+ "fmla z19.s, z13.s, z6.s[3]\n"
+ "fmla z20.s, z13.s, z7.s[0]\n"
+ "fmla z21.s, z13.s, z7.s[1]\n"
+ "fmla z22.s, z13.s, z7.s[2]\n"
+ "fmla z23.s, z13.s, z7.s[3]\n"
+ "fmla z24.s, z13.s, z1.s[0]\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z9.s\n"
+ "fmin z17.s, p1/M, z17.s, z9.s\n"
+ "fmla z26.s, z13.s, z1.s[2]\n"
+ "fmla z27.s, z13.s, z1.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z9.s\n"
+ "fmin z19.s, p1/M, z19.s, z9.s\n"
+ "fmla z28.s, z13.s, z2.s[0]\n"
+ "fmla z29.s, z13.s, z2.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z9.s\n"
+ "fmin z21.s, p1/M, z21.s, z9.s\n"
+ "fmla z30.s, z13.s, z2.s[2]\n"
+ "fmla z31.s, z13.s, z2.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z9.s\n"
+ "fmin z23.s, p1/M, z23.s, z9.s\n"
+ "fmax z16.s, p1/M, z16.s, z10.s\n"
+ "fmax z17.s, p1/M, z17.s, z10.s\n"
+ "fmax z18.s, p1/M, z18.s, z10.s\n"
+ "fmax z19.s, p1/M, z19.s, z10.s\n"
+ "fmax z20.s, p1/M, z20.s, z10.s\n"
+ "fmax z21.s, p1/M, z21.s, z10.s\n"
+ "fmax z22.s, p1/M, z22.s, z10.s\n"
+ "fmax z23.s, p1/M, z23.s, z10.s\n"
"st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "fmin z26.s, p1/M, z26.s, z14.s\n"
- "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "fmin z24.s, p1/M, z24.s, z9.s\n"
+ "fmin z25.s, p1/M, z25.s, z9.s\n"
"st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "fmin z28.s, p1/M, z28.s, z14.s\n"
- "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "fmin z26.s, p1/M, z26.s, z9.s\n"
+ "fmin z27.s, p1/M, z27.s, z9.s\n"
"st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "fmin z30.s, p1/M, z30.s, z14.s\n"
- "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "fmin z28.s, p1/M, z28.s, z9.s\n"
+ "fmin z29.s, p1/M, z29.s, z9.s\n"
"st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z30.s, p1/M, z30.s, z9.s\n"
+ "fmin z31.s, p1/M, z31.s, z9.s\n"
"st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "fmax z24.s, p1/M, z24.s, z15.s\n"
- "fmax z25.s, p1/M, z25.s, z15.s\n"
"st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "fmax z26.s, p1/M, z26.s, z15.s\n"
- "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "fmax z24.s, p1/M, z24.s, z10.s\n"
+ "fmax z25.s, p1/M, z25.s, z10.s\n"
"st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "fmax z28.s, p1/M, z28.s, z15.s\n"
- "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "fmax z26.s, p1/M, z26.s, z10.s\n"
+ "fmax z27.s, p1/M, z27.s, z10.s\n"
"st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "fmax z30.s, p1/M, z30.s, z15.s\n"
- "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "fmax z28.s, p1/M, z28.s, z10.s\n"
+ "fmax z29.s, p1/M, z29.s, z10.s\n"
+ "fmax z30.s, p1/M, z30.s, z10.s\n"
+ "fmax z31.s, p1/M, z31.s, z10.s\n"
"st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
"st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
"st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
@@ -358,81 +358,81 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
"b 7f\n"
"6:" // Output channel loop: Single kernel point
- "fmla z16.s, z8.s, z6.s[0]\n"
- "fmla z17.s, z8.s, z6.s[1]\n"
- "fmin z16.s, p1/M, z16.s, z14.s\n"
- "fmin z17.s, p1/M, z17.s, z14.s\n"
- "fmla z18.s, z8.s, z6.s[2]\n"
- "fmla z19.s, z8.s, z6.s[3]\n"
- "fmin z18.s, p1/M, z18.s, z14.s\n"
- "fmin z19.s, p1/M, z19.s, z14.s\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z21.s, z8.s, z5.s[1]\n"
- "fmin z20.s, p1/M, z20.s, z14.s\n"
- "fmin z21.s, p1/M, z21.s, z14.s\n"
- "fmla z22.s, z8.s, z5.s[2]\n"
- "fmla z23.s, z8.s, z5.s[3]\n"
- "fmin z22.s, p1/M, z22.s, z14.s\n"
- "fmin z23.s, p1/M, z23.s, z14.s\n"
- "fmla z24.s, z8.s, z1.s[0]\n"
- "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z16.s, z13.s, z7.s[0]\n"
+ "fmla z17.s, z13.s, z7.s[1]\n"
"ldr x27, [%x[outptrs], #0x0]\n"
"ldr x26, [%x[outptrs], #0x8]\n"
- "fmla z26.s, z8.s, z1.s[2]\n"
- "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z18.s, z13.s, z7.s[2]\n"
+ "fmla z19.s, z13.s, z7.s[3]\n"
"ldr x25, [%x[outptrs], #0x10]\n"
"ldr x24, [%x[outptrs], #0x18]\n"
- "fmla z28.s, z8.s, z2.s[0]\n"
- "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z20.s, z13.s, z6.s[0]\n"
+ "fmla z21.s, z13.s, z6.s[1]\n"
"ldr x23, [%x[outptrs], #0x20]\n"
"ldr x22, [%x[outptrs], #0x28]\n"
- "fmla z30.s, z8.s, z2.s[2]\n"
- "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z22.s, z13.s, z6.s[2]\n"
+ "fmla z23.s, z13.s, z6.s[3]\n"
"ldr x21, [%x[outptrs], #0x30]\n"
"ldr x20, [%x[outptrs], #0x38]\n"
- "fmax z16.s, p1/M, z16.s, z15.s\n"
- "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z24.s, z13.s, z1.s[0]\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z9.s\n"
+ "fmin z17.s, p1/M, z17.s, z9.s\n"
+ "fmla z26.s, z13.s, z1.s[2]\n"
+ "fmla z27.s, z13.s, z1.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z9.s\n"
+ "fmin z19.s, p1/M, z19.s, z9.s\n"
+ "fmla z28.s, z13.s, z2.s[0]\n"
+ "fmla z29.s, z13.s, z2.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z9.s\n"
+ "fmin z21.s, p1/M, z21.s, z9.s\n"
+ "fmla z30.s, z13.s, z2.s[2]\n"
+ "fmla z31.s, z13.s, z2.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z9.s\n"
+ "fmin z23.s, p1/M, z23.s, z9.s\n"
+ "fmax z16.s, p1/M, z16.s, z10.s\n"
+ "fmax z17.s, p1/M, z17.s, z10.s\n"
+ "fmax z18.s, p1/M, z18.s, z10.s\n"
+ "fmax z19.s, p1/M, z19.s, z10.s\n"
+ "fmax z20.s, p1/M, z20.s, z10.s\n"
+ "fmax z21.s, p1/M, z21.s, z10.s\n"
+ "fmax z22.s, p1/M, z22.s, z10.s\n"
+ "fmax z23.s, p1/M, z23.s, z10.s\n"
"st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
"ldr x27, [%x[outptrs], #0x40]\n"
- "fmax z18.s, p1/M, z18.s, z15.s\n"
- "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z9.s\n"
+ "fmin z25.s, p1/M, z25.s, z9.s\n"
"st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
"ldr x26, [%x[outptrs], #0x48]\n"
- "fmax z20.s, p1/M, z20.s, z15.s\n"
- "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmin z26.s, p1/M, z26.s, z9.s\n"
+ "fmin z27.s, p1/M, z27.s, z9.s\n"
"st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
"ldr x25, [%x[outptrs], #0x50]\n"
- "fmax z22.s, p1/M, z22.s, z15.s\n"
- "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z28.s, p1/M, z28.s, z9.s\n"
+ "fmin z29.s, p1/M, z29.s, z9.s\n"
"st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
"ldr x24, [%x[outptrs], #0x58]\n"
- "fmin z24.s, p1/M, z24.s, z14.s\n"
- "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "fmin z30.s, p1/M, z30.s, z9.s\n"
+ "fmin z31.s, p1/M, z31.s, z9.s\n"
"st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
"ldr x23, [%x[outptrs], #0x60]\n"
- "fmin z26.s, p1/M, z26.s, z14.s\n"
- "fmin z27.s, p1/M, z27.s, z14.s\n"
"st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
"ldr x22, [%x[outptrs], #0x68]\n"
- "fmin z28.s, p1/M, z28.s, z14.s\n"
- "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "fmax z24.s, p1/M, z24.s, z10.s\n"
+ "fmax z25.s, p1/M, z25.s, z10.s\n"
"st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
"ldr x21, [%x[outptrs], #0x70]\n"
- "fmin z30.s, p1/M, z30.s, z14.s\n"
- "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "fmax z26.s, p1/M, z26.s, z10.s\n"
+ "fmax z27.s, p1/M, z27.s, z10.s\n"
"st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
"ldr x20, [%x[outptrs], #0x78]\n"
- "fmax z24.s, p1/M, z24.s, z15.s\n"
- "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "fmax z28.s, p1/M, z28.s, z10.s\n"
+ "fmax z29.s, p1/M, z29.s, z10.s\n"
+ "fmax z30.s, p1/M, z30.s, z10.s\n"
+ "fmax z31.s, p1/M, z31.s, z10.s\n"
"st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
- "fmax z26.s, p1/M, z26.s, z15.s\n"
- "fmax z27.s, p1/M, z27.s, z15.s\n"
"st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
- "fmax z28.s, p1/M, z28.s, z15.s\n"
- "fmax z29.s, p1/M, z29.s, z15.s\n"
"st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
- "fmax z30.s, p1/M, z30.s, z15.s\n"
- "fmax z31.s, p1/M, z31.s, z15.s\n"
"st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
"st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
"st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
@@ -444,7 +444,7 @@ void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
"b.any 1b\n"
: [weights] "+&r" (weights)
: [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
- : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 0cee302c56..4149e0c117 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,456 +34,456 @@ void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_chan
{
__asm__ __volatile__(
"mov x14, #0x0\n"
- "whilelt p0.b, x14, %x[n_channels]\n"
"ldp x27, x26, [%x[inptrs], #0x0]\n"
"ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "mov x28, #0x1\n"
"ldp x23, x22, [%x[inptrs], #0x20]\n"
- "ldp x13, x21, [%x[inptrs], #0x30]\n"
- "mov x20, #0x1\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
"ptrue p2.b\n"
+ "mov x13, #0x0\n"
"ldp x12, x11, [%x[outptrs], #0x0]\n"
"ldp x10, x9, [%x[outptrs], #0x10]\n"
- "orr x20, x20, #0x100\n"
- "orr x20, x20, #0x10000\n"
- "ld1b { z15.b }, p0/Z, [x27, x14]\n"
- "ld1b { z21.b }, p0/Z, [x26, x14]\n"
- "dup z25.s, w20\n"
- "mov x28, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "orr x28, x28, #0x100\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1b { z12.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x26, x14]\n"
+ "orr x28, x28, #0x10000\n"
"ldp x27, x26, [%x[inptrs], #0x40]\n"
- "ld1b { z31.b }, p0/Z, [x25, x14]\n"
- "zip2 z16.b, z15.b, z31.b\n"
- "zip1 z15.b, z15.b, z31.b\n"
- "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z14.b }, p0/Z, [x24, x14]\n"
"ldp x25, x24, [%x[inptrs], #0x50]\n"
- "zip1 z30.b, z21.b, z29.b\n"
- "zip2 z29.b, z21.b, z29.b\n"
- "ld1b { z9.b }, p0/Z, [x23, x14]\n"
- "ld1b { z20.b }, p0/Z, [x22, x14]\n"
- "zip2 z13.b, z15.b, z30.b\n"
- "zip1 z15.b, z15.b, z30.b\n"
+ "ld1b { z5.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z3.b }, p0/Z, [x22, x14]\n"
+ "dup z9.s, w28\n"
"ldp x23, x22, [%x[inptrs], #0x60]\n"
- "ld1b { z5.b }, p0/Z, [x13, x14]\n"
- "zip1 z14.b, z16.b, z29.b\n"
- "zip2 z29.b, z16.b, z29.b\n"
- "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x21, x14]\n"
+ "zip2 z18.b, z12.b, z26.b\n"
+ "zip1 z12.b, z12.b, z26.b\n"
+ "ld1b { z30.b }, p0/Z, [x20, x14]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "zip2 z31.b, z9.b, z5.b\n"
- "zip1 z9.b, z9.b, z5.b\n"
- "ld1b { z18.b }, p0/Z, [x27, x14]\n"
- "ld1b { z28.b }, p0/Z, [x26, x14]\n"
- "zip1 z21.b, z20.b, z17.b\n"
- "zip2 z17.b, z20.b, z17.b\n"
- "ld1b { z6.b }, p0/Z, [x25, x14]\n"
- "ld1b { z4.b }, p0/Z, [x24, x14]\n"
- "zip2 z23.b, z18.b, z6.b\n"
- "zip1 z18.b, z18.b, z6.b\n"
- "ld1b { z2.b }, p0/Z, [x23, x14]\n"
- "ld1b { z19.b }, p0/Z, [x22, x14]\n"
- "zip1 z24.b, z28.b, z4.b\n"
- "zip2 z4.b, z28.b, z4.b\n"
- "ld1b { z16.b }, p0/Z, [x21, x14]\n"
- "ld1b { z5.b }, p0/Z, [x20, x14]\n"
- "zip2 z22.b, z2.b, z16.b\n"
- "zip1 z2.b, z2.b, z16.b\n"
- "zip1 z0.b, z19.b, z5.b\n"
- "zip2 z5.b, z19.b, z5.b\n"
- "ld1w { z10.s }, p2/Z, [%x[params]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "zip2 z19.b, z9.b, z21.b\n"
- "zip1 z9.b, z9.b, z21.b\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ldp x27, x26, [%x[inptrs], #0x0]\n"
- "zip1 z11.b, z31.b, z17.b\n"
- "zip2 z17.b, z31.b, z17.b\n"
- "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z17.b, z24.b, z14.b\n"
+ "zip2 z14.b, z24.b, z14.b\n"
+ "ld1b { z29.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z7.b }, p0/Z, [x24, x14]\n"
+ "zip2 z22.b, z5.b, z19.b\n"
+ "zip1 z5.b, z5.b, z19.b\n"
+ "ld1b { z6.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x22, x14]\n"
+ "zip2 z2.b, z12.b, z17.b\n"
+ "zip1 z12.b, z12.b, z17.b\n"
+ "ld1b { z23.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "zip1 z8.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "zip1 z26.b, z3.b, z30.b\n"
+ "zip2 z30.b, z3.b, z30.b\n"
+ "ld1w { z0.s }, p2/Z, [%x[params]]\n"
+ "ldp x28, x27, [%x[inptrs], #0x0]\n"
+ "zip2 z24.b, z29.b, z16.b\n"
+ "zip1 z29.b, z29.b, z16.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
"ldp x24, x22, [%x[inptrs], #0x20]\n"
- "zip2 z12.b, z18.b, z24.b\n"
- "zip1 z18.b, z18.b, z24.b\n"
+ "zip1 z16.b, z25.b, z7.b\n"
+ "zip2 z7.b, z25.b, z7.b\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "zip1 z20.b, z23.b, z4.b\n"
- "zip2 z4.b, z23.b, z4.b\n"
- "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z24.b, z2.b, z0.b\n"
- "zip1 z2.b, z2.b, z0.b\n"
- "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "zip1 z0.b, z22.b, z5.b\n"
- "zip2 z5.b, z22.b, z5.b\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z25.b, z6.b, z23.b\n"
+ "zip1 z6.b, z6.b, z23.b\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z19.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
"addvl %x[params], %x[params], #4\n"
- "mov z22.d, z10.d\n"
- "mov z31.d, z10.d\n"
- "mov z21.d, z10.d\n"
+ "zip2 z23.b, z5.b, z26.b\n"
+ "zip1 z5.b, z5.b, z26.b\n"
+ "zip1 z3.b, z22.b, z30.b\n"
+ "zip2 z30.b, z22.b, z30.b\n"
+ "zip2 z11.b, z29.b, z16.b\n"
+ "zip1 z29.b, z29.b, z16.b\n"
+ "zip1 z16.b, z24.b, z7.b\n"
+ "zip2 z7.b, z24.b, z7.b\n"
+ "zip2 z1.b, z6.b, z19.b\n"
+ "zip1 z6.b, z6.b, z19.b\n"
+ "zip1 z27.b, z25.b, z4.b\n"
+ "zip2 z4.b, z25.b, z4.b\n"
+ "mov z26.d, z0.d\n"
+ "mov z25.d, z0.d\n"
+ "mov z28.d, z0.d\n"
"1:" // Loop
- "mov z30.s, #0x0\n"
- "sdot z30.s, z25.b, z9.b\n"
- "sdot z10.s, z26.b, z15.b\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
- "sdot z30.s, z25.b, z18.b\n"
- "sdot z31.s, z26.b, z9.b\n"
- "mov z27.s, #0x0\n"
- "incw x14, ALL, MUL #4\n"
- "sdot z10.s, z3.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "movprfx z28, z30\n sdot z28.s, z25.b, z2.b\n"
- "sdot z30.s, z25.b, z15.b\n"
- "ext z15.b, z15.b, z15.b, #0x1\n"
- "sdot z27.s, z25.b, z9.b\n"
- "sdot z31.s, z3.b, z18.b\n"
- "sdot z10.s, z1.b, z18.b\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "sdot z22.s, z26.b, z15.b\n"
- "sdot z21.s, z26.b, z9.b\n"
- "sdot z27.s, z25.b, z18.b\n"
- "sdot z31.s, z1.b, z2.b\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "sdot z22.s, z3.b, z9.b\n"
- "sdot z21.s, z3.b, z18.b\n"
- "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "mls z10.s, p2/M, z30.s, z8.s\n"
- "movprfx z26, z27\n sdot z26.s, z25.b, z2.b\n"
- "mov z9.s, #0x0\n"
- "sdot z27.s, z25.b, z15.b\n"
- "ld1w { z23.s }, p2/Z, [%x[params]]\n"
- "sdot z22.s, z1.b, z18.b\n"
- ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
- "sdot z21.s, z1.b, z2.b\n"
- "mls z22.s, p2/M, z27.s, z8.s\n"
- "and z18.d, z10.d, z3.d\n"
- "mls z31.s, p2/M, z28.s, z8.s\n"
- "mls z21.s, p2/M, z26.s, z8.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- "sdot z9.s, z25.b, z19.b\n"
- ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
- "sqadd z10.s, z10.s, z18.s\n"
- ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
- "sdot z9.s, z25.b, z12.b\n"
- "and z28.d, z22.d, z3.d\n"
- "and z23.d, z31.d, z3.d\n"
- "movprfx z27, z9\n sdot z27.s, z25.b, z24.b\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "and z18.d, z21.d, z3.d\n"
- "asr z28.s, z28.s, #0x1f\n"
- "sdot z9.s, z25.b, z13.b\n"
- "asr z23.s, z23.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z22.s, z22.s, z28.s\n"
- "sqadd z31.s, z31.s, z23.s\n"
- ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
- ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
- "sqadd z21.s, z21.s, z18.s\n"
- "add z10.s, z10.s, z16.s\n"
- ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
- "smax z10.s, p2/M, z10.s, z7.s\n"
- "add z22.s, z22.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smax z22.s, p2/M, z22.s, z7.s\n"
- "add z21.s, z21.s, z16.s\n"
- "smax z31.s, p2/M, z31.s, z7.s\n"
- "smax z21.s, p2/M, z21.s, z7.s\n"
- "st1b { z10.s }, p0, [x12, x28]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "st1b { z22.s }, p0, [x11, x28]\n"
- "mov z26.d, z28.d\n"
- "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z31.s }, p0, [x10, x28]\n"
- "mov z31.d, z28.d\n"
- "sdot z31.s, z1.b, z19.b\n"
- "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z21.s }, p0, [x9, x28]\n"
- "mov z22.d, z28.d\n"
- "sdot z28.s, z1.b, z13.b\n"
- "sdot z28.s, z15.b, z19.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
- "sdot z26.s, z1.b, z13.b\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z24.s, #0x0\n"
+ "sdot z0.s, z17.b, z12.b\n"
+ "sdot z25.s, z17.b, z5.b\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
"mov z18.s, #0x0\n"
- "sdot z22.s, z1.b, z19.b\n"
- "sdot z18.s, z25.b, z19.b\n"
- "incw x28\n"
- "sdot z31.s, z15.b, z12.b\n"
- "sdot z28.s, z23.b, z12.b\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z24.s, z9.b, z5.b\n"
+ "sdot z0.s, z20.b, z5.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "sdot z25.s, z20.b, z29.b\n"
+ "sdot z24.s, z9.b, z29.b\n"
+ "sdot z18.s, z9.b, z5.b\n"
+ "sdot z0.s, z10.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "sdot z28.s, z17.b, z5.b\n"
+ "movprfx z19, z24\n sdot z19.s, z9.b, z6.b\n"
+ "sdot z24.s, z9.b, z12.b\n"
"ext z12.b, z12.b, z12.b, #0x1\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
- "sdot z26.s, z15.b, z19.b\n"
- "sdot z22.s, z15.b, z12.b\n"
+ "sdot z25.s, z10.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "sdot z18.s, z9.b, z29.b\n"
+ "sdot z26.s, z17.b, z12.b\n"
+ "sdot z28.s, z20.b, z29.b\n"
+ "mls z0.s, p2/M, z24.s, z13.s\n"
+ "mov z22.s, #0x0\n"
+ "mls z25.s, p2/M, z19.s, z13.s\n"
+ "sdot z22.s, z9.b, z23.b\n"
+ "sdot z26.s, z20.b, z5.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "movprfx z5, z18\n sdot z5.s, z9.b, z6.b\n"
+ "sdot z18.s, z9.b, z12.b\n"
+ "ld1w { z19.s }, p2/Z, [%x[params]]\n"
+ "sdot z28.s, z10.b, z6.b\n"
+ "sdot z22.s, z9.b, z11.b\n"
+ "sdot z26.s, z10.b, z29.b\n"
+ ".inst 0x04b37400 // sqrdmulh z0.s, z0.s, z19.s\n"
+ ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
+ "mls z28.s, p2/M, z5.s, z13.s\n"
+ "and z5.d, z0.d, z20.d\n"
+ "mls z26.s, p2/M, z18.s, z13.s\n"
+ "mov z18.s, #0x0\n"
+ "and z12.d, z25.d, z20.d\n"
+ "movprfx z10, z22\n sdot z10.s, z9.b, z1.b\n"
+ "sdot z22.s, z9.b, z2.b\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "asr z12.s, z12.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z5.s\n"
+ "and z19.d, z26.d, z20.d\n"
+ "and z6.d, z28.d, z20.d\n"
+ ".inst 0x44828a80 // srshl z0.s, p2/M, z0.s, z20.s\n"
+ "sqadd z25.s, z25.s, z12.s\n"
+ "ld1b { z5.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "add z0.s, z0.s, z21.s\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ "sqadd z28.s, z28.s, z6.s\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smax z0.s, p2/M, z0.s, z31.s\n"
+ ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
+ ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z26.s, z26.s, z21.s\n"
+ "smin z0.s, p2/M, z0.s, z15.s\n"
+ "add z28.s, z28.s, z21.s\n"
+ "smax z26.s, p2/M, z26.s, z31.s\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "smax z28.s, p2/M, z28.s, z31.s\n"
+ "st1b { z0.s }, p0, [x12, x13]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z18.s, z25.b, z12.b\n"
- "sdot z31.s, z23.b, z24.b\n"
- "ext z24.b, z24.b, z24.b, #0x1\n"
- "mls z28.s, p2/M, z9.s, z8.s\n"
- "sdot z26.s, z23.b, z12.b\n"
- ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
- "sdot z22.s, z23.b, z24.b\n"
- "movprfx z12, z18\n sdot z12.s, z25.b, z24.b\n"
- "and z2.d, z28.d, z21.d\n"
- "sdot z18.s, z25.b, z13.b\n"
- "mls z26.s, p2/M, z18.s, z8.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "mls z31.s, p2/M, z27.s, z8.s\n"
- "mls z22.s, p2/M, z12.s, z8.s\n"
- ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
- ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "sqadd z28.s, z28.s, z2.s\n"
- "and z24.d, z26.d, z21.d\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- "and z23.d, z31.d, z21.d\n"
- "and z18.d, z22.d, z21.d\n"
- "asr z24.s, z24.s, #0x1f\n"
- "asr z23.s, z23.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z26.s, z26.s, z24.s\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
- "sqadd z31.s, z31.s, z23.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
- "add z28.s, z28.s, z16.s\n"
- "smax z28.s, p2/M, z28.s, z7.s\n"
- "add z26.s, z26.s, z16.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "add z31.s, z31.s, z16.s\n"
- "add z22.s, z22.s, z16.s\n"
- "smax z26.s, p2/M, z26.s, z7.s\n"
- "smax z31.s, p2/M, z31.s, z7.s\n"
- "mov z24.s, #0x0\n"
- "sdot z24.s, z25.b, z11.b\n"
- "smax z22.s, p2/M, z22.s, z7.s\n"
- "st1b { z28.s }, p0, [x12, x28]\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "st1b { z26.s }, p0, [x11, x28]\n"
- "mov z28.d, z23.d\n"
- "sdot z24.s, z25.b, z20.b\n"
- "st1b { z31.s }, p0, [x10, x28]\n"
- "mov z27.d, z23.d\n"
- "sdot z27.s, z19.b, z11.b\n"
- "movprfx z13, z24\n sdot z13.s, z25.b, z0.b\n"
- "st1b { z22.s }, p0, [x9, x28]\n"
- "mov z26.d, z23.d\n"
- "sdot z23.s, z19.b, z14.b\n"
- "sdot z23.s, z30.b, z11.b\n"
- "sdot z24.s, z25.b, z14.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z28.s, z19.b, z14.b\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "st1b { z26.s }, p0, [x11, x13]\n"
+ "mov z6.d, z29.d\n"
+ "st1b { z25.s }, p0, [x10, x13]\n"
+ "mov z25.d, z29.d\n"
+ "st1b { z28.s }, p0, [x9, x13]\n"
+ "mov z0.d, z29.d\n"
+ "sdot z29.s, z17.b, z2.b\n"
+ "incw x13\n"
+ "sdot z25.s, z17.b, z23.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "sdot z29.s, z19.b, z23.b\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "sdot z6.s, z17.b, z2.b\n"
+ "sdot z0.s, z17.b, z23.b\n"
+ "sdot z18.s, z9.b, z23.b\n"
+ "sdot z25.s, z19.b, z11.b\n"
+ "sdot z29.s, z5.b, z11.b\n"
"ext z11.b, z11.b, z11.b, #0x1\n"
- "mov z12.s, #0x0\n"
- "sdot z26.s, z19.b, z11.b\n"
- "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z12.s, z25.b, z11.b\n"
- "sdot z27.s, z30.b, z20.b\n"
- "incw x28\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
- "sdot z23.s, z21.b, z20.b\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
- "sdot z28.s, z30.b, z11.b\n"
- "sdot z26.s, z30.b, z20.b\n"
- "sdot z12.s, z25.b, z20.b\n"
- "sdot z27.s, z21.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "mls z23.s, p2/M, z24.s, z8.s\n"
- "sdot z28.s, z21.b, z20.b\n"
- "sdot z26.s, z21.b, z0.b\n"
- ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
- "movprfx z19, z12\n sdot z19.s, z25.b, z0.b\n"
- "sdot z12.s, z25.b, z14.b\n"
- "and z18.d, z23.d, z22.d\n"
- "mls z28.s, p2/M, z12.s, z8.s\n"
- "mls z27.s, p2/M, z13.s, z8.s\n"
+ "sdot z6.s, z19.b, z23.b\n"
+ "sdot z0.s, z19.b, z11.b\n"
+ "sdot z18.s, z9.b, z11.b\n"
+ "sdot z25.s, z5.b, z1.b\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "mls z29.s, p2/M, z22.s, z13.s\n"
+ "mov z28.s, #0x0\n"
+ "sdot z6.s, z5.b, z11.b\n"
+ "sdot z0.s, z5.b, z1.b\n"
+ "movprfx z11, z18\n sdot z11.s, z9.b, z1.b\n"
+ "sdot z18.s, z9.b, z2.b\n"
+ "sdot z28.s, z9.b, z3.b\n"
+ ".inst 0x04b877bd // sqrdmulh z29.s, z29.s, z24.s\n"
+ "mls z25.s, p2/M, z10.s, z13.s\n"
+ "mls z6.s, p2/M, z18.s, z13.s\n"
+ "mov z1.s, #0x0\n"
+ "mls z0.s, p2/M, z11.s, z13.s\n"
+ "and z11.d, z29.d, z12.d\n"
+ ".inst 0x04b87739 // sqrdmulh z25.s, z25.s, z24.s\n"
+ "sdot z28.s, z9.b, z16.b\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ ".inst 0x04b874c6 // sqrdmulh z6.s, z6.s, z24.s\n"
+ ".inst 0x04b87400 // sqrdmulh z0.s, z0.s, z24.s\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z22.d, z25.d, z12.d\n"
+ "sqadd z29.s, z29.s, z11.s\n"
+ "and z18.d, z6.d, z12.d\n"
+ "movprfx z24, z28\n sdot z24.s, z9.b, z27.b\n"
+ "sdot z28.s, z9.b, z8.b\n"
+ "and z11.d, z0.d, z12.d\n"
+ "asr z22.s, z22.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
- "mls z26.s, p2/M, z19.s, z8.s\n"
- ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
- ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
- ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
- "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sqadd z23.s, z23.s, z18.s\n"
- "and z20.d, z28.d, z22.d\n"
- ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
- "and z19.d, z27.d, z22.d\n"
- "and z18.d, z26.d, z22.d\n"
- "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x4482899d // srshl z29.s, p2/M, z29.s, z12.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z25.s, z25.s, z22.s\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "add z29.s, z29.s, z21.s\n"
+ "sqadd z0.s, z0.s, z11.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ ".inst 0x44828986 // srshl z6.s, p2/M, z6.s, z12.s\n"
+ ".inst 0x44828999 // srshl z25.s, p2/M, z25.s, z12.s\n"
+ "smax z29.s, p2/M, z29.s, z31.s\n"
+ ".inst 0x44828980 // srshl z0.s, p2/M, z0.s, z12.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "add z6.s, z6.s, z21.s\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z0.s, z0.s, z21.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "smax z6.s, p2/M, z6.s, z31.s\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "smax z0.s, p2/M, z0.s, z31.s\n"
+ "st1b { z29.s }, p0, [x12, x13]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "smin z6.s, p2/M, z6.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "smin z0.s, p2/M, z0.s, z15.s\n"
+ "st1b { z6.s }, p0, [x11, x13]\n"
+ "mov z11.d, z29.d\n"
+ "st1b { z25.s }, p0, [x10, x13]\n"
+ "mov z26.d, z29.d\n"
+ "st1b { z0.s }, p0, [x9, x13]\n"
+ "mov z25.d, z29.d\n"
+ "sdot z29.s, z18.b, z8.b\n"
+ "incw x13\n"
+ "sdot z26.s, z18.b, z3.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "sdot z29.s, z20.b, z3.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "sdot z11.s, z18.b, z8.b\n"
+ "sdot z25.s, z18.b, z3.b\n"
+ "sdot z1.s, z9.b, z3.b\n"
+ "sdot z26.s, z20.b, z16.b\n"
+ "sdot z29.s, z19.b, z16.b\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "sdot z11.s, z20.b, z3.b\n"
+ "sdot z25.s, z20.b, z16.b\n"
+ "sdot z1.s, z9.b, z16.b\n"
+ "sdot z26.s, z19.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "mls z29.s, p2/M, z28.s, z13.s\n"
+ "mov z22.s, #0x0\n"
+ "sdot z11.s, z19.b, z16.b\n"
+ "sdot z25.s, z19.b, z27.b\n"
+ "movprfx z18, z1\n sdot z18.s, z9.b, z27.b\n"
+ "sdot z1.s, z9.b, z8.b\n"
+ "sdot z22.s, z9.b, z30.b\n"
+ ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n"
+ "mls z26.s, p2/M, z24.s, z13.s\n"
+ "mls z11.s, p2/M, z1.s, z13.s\n"
+ "mov z10.s, #0x0\n"
+ "mls z25.s, p2/M, z18.s, z13.s\n"
+ "and z18.d, z29.d, z23.d\n"
+ ".inst 0x04a5775a // sqrdmulh z26.s, z26.s, z5.s\n"
+ "sdot z22.s, z9.b, z7.b\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04a5756b // sqrdmulh z11.s, z11.s, z5.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ "ld1w { z8.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z19.d, z26.d, z23.d\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ "and z18.d, z11.d, z23.d\n"
+ "movprfx z6, z22\n sdot z6.s, z9.b, z4.b\n"
+ "sdot z22.s, z9.b, z14.b\n"
+ "and z20.d, z25.d, z23.d\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z28.s, z28.s, z20.s\n"
- ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
- "ld1b { z13.b }, p2/Z, [%x[params]]\n"
- "sqadd z27.s, z27.s, z19.s\n"
- "sqadd z26.s, z26.s, z18.s\n"
- ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
- ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
- "add z23.s, z23.s, z16.s\n"
- "smax z23.s, p2/M, z23.s, z7.s\n"
- "add z28.s, z28.s, z16.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "add z27.s, z27.s, z16.s\n"
- "add z26.s, z26.s, z16.s\n"
- "smax z28.s, p2/M, z28.s, z7.s\n"
- "smax z27.s, p2/M, z27.s, z7.s\n"
- "mov z24.s, #0x0\n"
- "sdot z24.s, z25.b, z17.b\n"
- "smax z26.s, p2/M, z26.s, z7.s\n"
- "st1b { z23.s }, p0, [x12, x28]\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "st1b { z28.s }, p0, [x11, x28]\n"
- "mov z0.d, z1.d\n"
- "sdot z24.s, z25.b, z4.b\n"
- "st1b { z27.s }, p0, [x10, x28]\n"
- "mov z31.d, z1.d\n"
- "sdot z31.s, z21.b, z17.b\n"
- "movprfx z23, z24\n sdot z23.s, z25.b, z5.b\n"
- "st1b { z26.s }, p0, [x9, x28]\n"
- "mov z30.d, z1.d\n"
- "sdot z1.s, z21.b, z29.b\n"
- "sdot z1.s, z13.b, z17.b\n"
- "sdot z24.s, z25.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "sdot z0.s, z21.b, z29.b\n"
- "ext z17.b, z17.b, z17.b, #0x1\n"
- "mov z19.s, #0x0\n"
- "sdot z30.s, z21.b, z17.b\n"
- "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z19.s, z25.b, z17.b\n"
- "sdot z31.s, z13.b, z4.b\n"
- "incw x28\n"
- "whilelt p1.s, x28, %x[n_channels]\n"
- "sdot z1.s, z20.b, z4.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "sdot z0.s, z13.b, z17.b\n"
+ ".inst 0x44828afd // srshl z29.s, p2/M, z29.s, z23.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z18.s\n"
+ "ld1b { z24.b }, p2/Z, [%x[params]]\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "add z29.s, z29.s, z21.s\n"
+ "sqadd z25.s, z25.s, z20.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ ".inst 0x44828aeb // srshl z11.s, p2/M, z11.s, z23.s\n"
+ ".inst 0x44828afa // srshl z26.s, p2/M, z26.s, z23.s\n"
+ "smax z29.s, p2/M, z29.s, z31.s\n"
+ ".inst 0x44828af9 // srshl z25.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z26.s, z26.s, z21.s\n"
+ "add z25.s, z25.s, z21.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "smax z11.s, p2/M, z11.s, z31.s\n"
+ "smax z26.s, p2/M, z26.s, z31.s\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "st1b { z29.s }, p0, [x12, x13]\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "smin z11.s, p2/M, z11.s, z15.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z11.s }, p0, [x11, x13]\n"
+ "mov z28.d, z2.d\n"
+ "st1b { z26.s }, p0, [x10, x13]\n"
+ "mov z1.d, z2.d\n"
+ "st1b { z25.s }, p0, [x9, x13]\n"
+ "mov z3.d, z2.d\n"
+ "sdot z2.s, z18.b, z14.b\n"
+ "incw x13\n"
+ "sdot z1.s, z18.b, z30.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
"whilelt p0.b, x14, %x[n_channels]\n"
- "sdot z30.s, z13.b, z4.b\n"
- "sdot z19.s, z25.b, z4.b\n"
- "ld1b { z13.b }, p0/Z, [x26, x14]\n"
- "ld1b { z28.b }, p0/Z, [x25, x14]\n"
- "sdot z31.s, z20.b, z5.b\n"
- "ext z5.b, z5.b, z5.b, #0x1\n"
- "mls z1.s, p2/M, z24.s, z8.s\n"
- "ld1b { z27.b }, p0/Z, [x22, x14]\n"
- "sdot z0.s, z20.b, z4.b\n"
- "sdot z30.s, z20.b, z5.b\n"
- ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
- "ld1b { z26.b }, p0/Z, [x21, x14]\n"
- "movprfx z18, z19\n sdot z18.s, z25.b, z5.b\n"
- "sdot z19.s, z25.b, z29.b\n"
- "and z11.d, z1.d, z22.d\n"
- "ld1b { z29.b }, p0/Z, [x23, x14]\n"
- "mls z0.s, p2/M, z19.s, z8.s\n"
- "mls z31.s, p2/M, z23.s, z8.s\n"
- "asr z11.s, z11.s, #0x1f\n"
- "ld1b { z17.b }, p0/Z, [x20, x14]\n"
- "mls z30.s, p2/M, z18.s, z8.s\n"
- ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
- ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
- ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
- "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "sdot z2.s, z24.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z28.s, z18.b, z14.b\n"
+ "ld1b { z0.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x21, x14]\n"
+ "sdot z3.s, z18.b, z30.b\n"
+ "sdot z10.s, z9.b, z30.b\n"
+ "sdot z1.s, z24.b, z7.b\n"
+ "sdot z2.s, z19.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ "sdot z28.s, z24.b, z30.b\n"
+ "ld1b { z30.b }, p0/Z, [x20, x14]\n"
+ "sdot z3.s, z24.b, z7.b\n"
+ "sdot z10.s, z9.b, z7.b\n"
+ "sdot z1.s, z19.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "mls z2.s, p2/M, z22.s, z13.s\n"
+ "sdot z28.s, z19.b, z7.b\n"
+ "sdot z3.s, z19.b, z4.b\n"
+ "movprfx z18, z10\n sdot z18.s, z9.b, z4.b\n"
+ "sdot z10.s, z9.b, z14.b\n"
+ "ld1b { z14.b }, p0/Z, [x25, x14]\n"
+ "mls z1.s, p2/M, z6.s, z13.s\n"
+ ".inst 0x04a87442 // sqrdmulh z2.s, z2.s, z8.s\n"
+ "mls z3.s, p2/M, z18.s, z13.s\n"
+ "and z18.d, z2.d, z23.d\n"
+ "mls z28.s, p2/M, z10.s, z13.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04a87421 // sqrdmulh z1.s, z1.s, z8.s\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a87463 // sqrdmulh z3.s, z3.s, z8.s\n"
+ "ld1b { z12.b }, p0/Z, [x28, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x40]\n"
- "sqadd z1.s, z1.s, z11.s\n"
- "and z21.d, z0.d, z22.d\n"
- ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
"ldp x21, x20, [%x[inptrs], #0x50]\n"
- "and z20.d, z31.d, z22.d\n"
- "and z19.d, z30.d, z22.d\n"
- "ld1b { z18.b }, p0/Z, [x23, x14]\n"
- "ld1b { z11.b }, p0/Z, [x22, x14]\n"
- "asr z21.s, z21.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "ld1b { z24.b }, p0/Z, [x21, x14]\n"
- "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "sqadd z2.s, z2.s, z18.s\n"
+ "and z22.d, z1.d, z23.d\n"
+ "and z18.d, z28.d, z23.d\n"
+ "and z19.d, z3.d, z23.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ ".inst 0x44828ae2 // srshl z2.s, p2/M, z2.s, z23.s\n"
+ "ld1b { z11.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z7.b }, p0/Z, [x20, x14]\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z19.s, z19.s, #0x1f\n"
- "sqadd z0.s, z0.s, z21.s\n"
- ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
- "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z30.s, z30.s, z19.s\n"
- ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
- ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
- "add z1.s, z1.s, z16.s\n"
- "smax z1.s, p2/M, z1.s, z7.s\n"
- "add z0.s, z0.s, z16.s\n"
- "ld1b { z9.b }, p0/Z, [x24, x14]\n"
- "add z31.s, z31.s, z16.s\n"
- "add z30.s, z30.s, z16.s\n"
+ "sqadd z1.s, z1.s, z22.s\n"
+ "ld1b { z10.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "add z2.s, z2.s, z21.s\n"
+ "sqadd z28.s, z28.s, z18.s\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z3.s, z3.s, z19.s\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "smax z2.s, p2/M, z2.s, z31.s\n"
+ ".inst 0x44828afc // srshl z28.s, p2/M, z28.s, z23.s\n"
+ ".inst 0x44828ae3 // srshl z3.s, p2/M, z3.s, z23.s\n"
+ "ld1b { z5.b }, p0/Z, [x24, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x60]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "smin z1.s, p2/M, z1.s, z6.s\n"
- "smax z0.s, p2/M, z0.s, z7.s\n"
- "st1b { z1.s }, p1, [x12, x28]\n"
- "ld1b { z2.b }, p0/Z, [x23, x14]\n"
- "smax z31.s, p2/M, z31.s, z7.s\n"
- "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ldp x28, x27, [%x[inptrs], #0x0]\n"
+ "add z1.s, z1.s, z21.s\n"
+ "smin z2.s, p2/M, z2.s, z15.s\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
+ "add z28.s, z28.s, z21.s\n"
+ "add z3.s, z3.s, z21.s\n"
+ "ld1b { z6.b }, p0/Z, [x23, x14]\n"
"ld1b { z23.b }, p0/Z, [x22, x14]\n"
- "ld1b { z22.b }, p0/Z, [x21, x14]\n"
- "ld1b { z5.b }, p0/Z, [x20, x14]\n"
- "zip2 z20.b, z15.b, z28.b\n"
- "zip1 z15.b, z15.b, z28.b\n"
- "smin z0.s, p2/M, z0.s, z6.s\n"
- "zip1 z19.b, z13.b, z29.b\n"
- "zip2 z29.b, z13.b, z29.b\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "st1b { z0.s }, p1, [x11, x28]\n"
- "zip2 z13.b, z15.b, z19.b\n"
- "zip1 z15.b, z15.b, z19.b\n"
- "ldp x27, x26, [%x[inptrs], #0x0]\n"
- "st1b { z31.s }, p1, [x10, x28]\n"
- "zip1 z14.b, z20.b, z29.b\n"
- "zip2 z29.b, z20.b, z29.b\n"
- "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z30.s }, p1, [x9, x28]\n"
- "zip2 z21.b, z9.b, z26.b\n"
- "zip1 z9.b, z9.b, z26.b\n"
- "incw x28\n"
- "zip1 z20.b, z27.b, z17.b\n"
- "zip2 z17.b, z27.b, z17.b\n"
- "ldp x25, x23, [%x[inptrs], #0x10]\n"
"ldp x24, x22, [%x[inptrs], #0x20]\n"
- "zip2 z31.b, z18.b, z24.b\n"
- "zip1 z18.b, z18.b, z24.b\n"
+ "smax z1.s, p2/M, z1.s, z31.s\n"
+ "st1b { z2.s }, p1, [x12, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "zip2 z19.b, z12.b, z27.b\n"
+ "zip1 z12.b, z12.b, z27.b\n"
+ "smax z28.s, p2/M, z28.s, z31.s\n"
+ "smax z3.s, p2/M, z3.s, z31.s\n"
+ "zip1 z18.b, z0.b, z14.b\n"
+ "zip2 z14.b, z0.b, z14.b\n"
+ "smin z1.s, p2/M, z1.s, z15.s\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "zip1 z27.b, z11.b, z4.b\n"
- "zip2 z4.b, z11.b, z4.b\n"
- "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z3.s, p2/M, z3.s, z15.s\n"
+ "zip2 z2.b, z12.b, z18.b\n"
+ "zip1 z12.b, z12.b, z18.b\n"
+ "zip1 z8.b, z19.b, z14.b\n"
+ "zip2 z14.b, z19.b, z14.b\n"
+ "ld1w { z0.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip2 z30.b, z2.b, z22.b\n"
- "zip1 z2.b, z2.b, z22.b\n"
- "zip1 z28.b, z23.b, z5.b\n"
- "zip2 z5.b, z23.b, z5.b\n"
- "zip2 z19.b, z9.b, z20.b\n"
- "zip1 z9.b, z9.b, z20.b\n"
- "zip1 z11.b, z21.b, z17.b\n"
- "zip2 z17.b, z21.b, z17.b\n"
- "zip2 z12.b, z18.b, z27.b\n"
- "zip1 z18.b, z18.b, z27.b\n"
- "zip1 z20.b, z31.b, z4.b\n"
- "zip2 z4.b, z31.b, z4.b\n"
- "zip2 z24.b, z2.b, z28.b\n"
- "zip1 z2.b, z2.b, z28.b\n"
- "zip1 z0.b, z30.b, z5.b\n"
- "zip2 z5.b, z30.b, z5.b\n"
- "mov z22.d, z10.d\n"
- "mov z31.d, z10.d\n"
- "mov z21.d, z10.d\n"
+ "st1b { z28.s }, p1, [x11, x13]\n"
+ "zip2 z27.b, z5.b, z25.b\n"
+ "zip1 z5.b, z5.b, z25.b\n"
+ "st1b { z1.s }, p1, [x10, x13]\n"
+ "zip1 z18.b, z26.b, z30.b\n"
+ "zip2 z30.b, z26.b, z30.b\n"
+ "st1b { z3.s }, p1, [x9, x13]\n"
+ "zip2 z19.b, z29.b, z11.b\n"
+ "zip1 z29.b, z29.b, z11.b\n"
+ "incw x13\n"
+ "zip1 z28.b, z24.b, z7.b\n"
+ "zip2 z7.b, z24.b, z7.b\n"
+ "zip2 z25.b, z6.b, z22.b\n"
+ "zip1 z6.b, z6.b, z22.b\n"
+ "zip1 z22.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "zip2 z23.b, z5.b, z18.b\n"
+ "zip1 z5.b, z5.b, z18.b\n"
+ "zip1 z3.b, z27.b, z30.b\n"
+ "zip2 z30.b, z27.b, z30.b\n"
+ "zip2 z11.b, z29.b, z28.b\n"
+ "zip1 z29.b, z29.b, z28.b\n"
+ "zip1 z16.b, z19.b, z7.b\n"
+ "zip2 z7.b, z19.b, z7.b\n"
+ "zip2 z1.b, z6.b, z22.b\n"
+ "zip1 z6.b, z6.b, z22.b\n"
+ "zip1 z27.b, z25.b, z4.b\n"
+ "zip2 z4.b, z25.b, z4.b\n"
+ "mov z26.d, z0.d\n"
+ "mov z25.d, z0.d\n"
+ "mov z28.d, z0.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 8ac522dc9a..08ef1d3aeb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const int8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const int8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,316 +91,316 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x16, #0x0\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x17, #0x0\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x16\n"
- "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_outptrs]]\n"
"ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z12.b }, p4/Z, [x21]\n"
- "ld1rb { z30.b }, p4/Z, [x20]\n"
- "add x21, x25, %[offsetof_Requantize32_minval]\n"
- "add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z24.h }, p4/Z, [x22]\n"
- "ld1rh { z11.h }, p4/Z, [x21]\n"
- "ld1rh { z26.h }, p4/Z, [x20]\n"
- "ldp x13, x12, [x24, #0x0]\n"
- "incw x23\n"
- "whilelt p3.h, x16, x15\n"
- "ldp x11, x10, [x24, #0x10]\n"
- "whilelt p2.s, x16, x15\n"
- "whilelt p1.s, x23, x15\n"
- "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z14.h }, p4/Z, [x14]\n"
- "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
- "add x28, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x27, #0x0\n"
- "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x12, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x17\n"
+ "add x20, x26, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x26, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x26, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x26, %[offsetof_Requantize32_minval]\n"
+ "add x20, x26, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z15.b }, p4/Z, [x23]\n"
+ "ld1rh { z26.h }, p4/Z, [x22]\n"
+ "ld1rh { z2.h }, p4/Z, [x21]\n"
+ "ld1rh { z14.h }, p4/Z, [x20]\n"
+ "incw x24\n"
+ "whilelt p3.h, x17, x15\n"
+ "ldp x9, x28, [x16, #0x0]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "whilelt p2.s, x17, x15\n"
+ "whilelt p1.s, x24, x15\n"
+ "ld1sb { z13.h }, p4/Z, [x14]\n"
+ "ld1sb { z11.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #2, MUL VL]\n"
"ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
- ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
- ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
- "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
- "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
- ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
- ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
- "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
- "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "ld1sb { z20.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z30.h }, p4/Z, [x14, #5, MUL VL]\n"
+ "ld1sb { z28.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z17.h }, p4/Z, [x14, #7, MUL VL]\n"
"inch x14, ALL, MUL #8\n"
- ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
- "ld1w { z17.s }, p2/Z, [x9]\n"
- "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z5.s, z17.s, z16.s\n"
- "uzp2 z9.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x14]\n"
- "ldp x24, x23, [x28, #0x0]\n"
- "addvl x9, x9, #2\n"
- "mov z17.d, z5.d\n"
- "ldp x22, x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x20]\n"
- "mov z25.d, z9.d\n"
- "mov z16.d, z5.d\n"
- "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
- "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
- "mov z23.d, z9.d\n"
- "mov z22.d, z5.d\n"
- "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
- "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
- "mov z27.d, z9.d\n"
- ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
- "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
- ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x9, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
- ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454f11ad // ssublb z13.h, z13.b, z15.b\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "ld1w { z24.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x454f116b // ssublb z11.h, z11.b, z15.b\n"
+ ".inst 0x454f1252 // ssublb z18.h, z18.b, z15.b\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ "ld1sb { z5.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ ".inst 0x454f1294 // ssublb z20.h, z20.b, z15.b\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
+ "uzp1 z3.s, z19.s, z24.s\n"
+ "uzp2 z16.s, z19.s, z24.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1231 // ssublb z17.h, z17.b, z15.b\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ld1sb { z7.h }, p3/Z, [x24, x17]\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x17]\n"
+ "mov z8.d, z3.d\n"
+ "mov z21.d, z16.d\n"
+ "ld1sb { z1.h }, p3/Z, [x21, x17]\n"
+ "mov z0.d, z3.d\n"
+ "mov z29.d, z16.d\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x17]\n"
+ "mov z19.d, z3.d\n"
+ "mov z9.d, z16.d\n"
+ ".inst 0x454a10e7 // ssublb z7.h, z7.b, z10.b\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x454a1084 // ssublb z4.h, z4.b, z10.b\n"
+ ".inst 0x454a1021 // ssublb z1.h, z1.b, z10.b\n"
+ ".inst 0x454a137b // ssublb z27.h, z27.b, z10.b\n"
"1:" // Loop
- ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
- ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
- "ldr x20, [x28, #0x28]\n"
- "ldr x21, [x28, #0x38]\n"
- ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
- ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
- "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
- "ldr x20, [x28, #0x30]\n"
- ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
- ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
- "ld1sb { z31.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
- ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
- "ldr x21, [x28, #0x40]\n"
- "ld1sb { z15.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
- ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- "ldr x20, [x28, #0x48]\n"
- ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
- ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
- "ld1sb { z19.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
- ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
- ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
- "ld1sb { z28.h }, p3/Z, [x20, x16]\n"
- ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
- ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
- ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
- "ldr x21, [x28, #0x50]\n"
- "ldr x20, [x28, #0x58]\n"
- ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
- ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
- ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
- "ld1sb { z4.h }, p3/Z, [x21, x16]\n"
- ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
- ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
- "ld1sb { z29.h }, p3/Z, [x20, x16]\n"
- "ldr x21, [x28, #0x60]\n"
- ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
- ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
- "ldr x20, [x28, #0x68]\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
- ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- "ld1sb { z0.h }, p3/Z, [x21, x16]\n"
- ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
- ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
- "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
- "ldr x20, [x28, #0x70]\n"
- ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
- ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- "ld1sb { z13.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
- ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
- ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
- "ldr x20, [x28, #0x78]\n"
- ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
- ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
- ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
- "whilelt p0.h, x27, x15\n"
- ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
- ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
- "ld1w { z20.s }, p2/Z, [x26]\n"
+ ".inst 0x449440e3 // smlalb z3.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x449444f0 // smlalt z16.s, p4/M, z7.h, z20.h\n"
+ "ldr x25, [x13, #0x28]\n"
+ "ldr x24, [x13, #0x38]\n"
+ ".inst 0x448640e8 // smlalb z8.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x448b40e0 // smlalb z0.s, p4/M, z7.h, z11.h\n"
+ "ldr x23, [x13, #0x30]\n"
+ "ldr x22, [x13, #0x40]\n"
+ ".inst 0x448d40f3 // smlalb z19.s, p4/M, z7.h, z13.h\n"
+ ".inst 0x448644f5 // smlalt z21.s, p4/M, z7.h, z6.h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "ld1sb { z22.h }, p3/Z, [x25, x17]\n"
+ ".inst 0x448b44fd // smlalt z29.s, p4/M, z7.h, z11.h\n"
+ ".inst 0x448d44e9 // smlalt z9.s, p4/M, z7.h, z13.h\n"
+ "ld1sb { z31.h }, p3/Z, [x24, x17]\n"
+ ".inst 0x448d4303 // smlalb z3.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x448d4710 // smlalt z16.s, p4/M, z24.h, z13.h\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1sb { z25.h }, p3/Z, [x22, x17]\n"
+ ".inst 0x44924088 // smlalb z8.s, p4/M, z4.h, z18.h\n"
+ ".inst 0x44924020 // smlalb z0.s, p4/M, z1.h, z18.h\n"
+ "ld1sb { z23.h }, p3/Z, [x20, x17]\n"
+ "ldr x20, [x13, #0x58]\n"
+ ".inst 0x448b4033 // smlalb z19.s, p4/M, z1.h, z11.h\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44924495 // smlalt z21.s, p4/M, z4.h, z18.h\n"
+ "ld1sb { z12.h }, p3/Z, [x21, x17]\n"
+ ".inst 0x4492443d // smlalt z29.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448b4429 // smlalt z9.s, p4/M, z1.h, z11.h\n"
+ ".inst 0x454a13ff // ssublb z31.h, z31.b, z10.b\n"
+ "ldr x21, [x13, #0x60]\n"
+ ".inst 0x449e4023 // smlalb z3.s, p4/M, z1.h, z30.h\n"
+ ".inst 0x449e4430 // smlalt z16.s, p4/M, z1.h, z30.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ld1sb { z4.h }, p3/Z, [x20, x17]\n"
+ ".inst 0x44944028 // smlalb z8.s, p4/M, z1.h, z20.h\n"
+ ".inst 0x449c42c0 // smlalb z0.s, p4/M, z22.h, z28.h\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ "ldr x20, [x13, #0x68]\n"
+ ".inst 0x44864373 // smlalb z19.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44944435 // smlalt z21.s, p4/M, z1.h, z20.h\n"
+ ".inst 0x454a12f7 // ssublb z23.h, z23.b, z10.b\n"
+ "ld1sb { z7.h }, p3/Z, [x21, x17]\n"
+ ".inst 0x449c46dd // smlalt z29.s, p4/M, z22.h, z28.h\n"
+ ".inst 0x44864769 // smlalt z9.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x454a118c // ssublb z12.h, z12.b, z10.b\n"
+ "ldr x21, [x13, #0x70]\n"
+ ".inst 0x44914363 // smlalb z3.s, p4/M, z27.h, z17.h\n"
+ ".inst 0x44914770 // smlalt z16.s, p4/M, z27.h, z17.h\n"
+ ".inst 0x454a1084 // ssublb z4.h, z4.b, z10.b\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x17]\n"
+ ".inst 0x449c4368 // smlalb z8.s, p4/M, z27.h, z28.h\n"
+ ".inst 0x44944360 // smlalb z0.s, p4/M, z27.h, z20.h\n"
+ ".inst 0x454a10e7 // ssublb z7.h, z7.b, z10.b\n"
+ "ldr x20, [x13, #0x78]\n"
+ ".inst 0x44854313 // smlalb z19.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x449c4775 // smlalt z21.s, p4/M, z27.h, z28.h\n"
+ "ld1sb { z1.h }, p3/Z, [x21, x17]\n"
+ "whilelt p0.h, x12, x15\n"
+ ".inst 0x4494477d // smlalt z29.s, p4/M, z27.h, z20.h\n"
+ ".inst 0x44854709 // smlalt z9.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
+ ".inst 0x448b43e3 // smlalb z3.s, p4/M, z31.h, z11.h\n"
+ ".inst 0x448b47f0 // smlalt z16.s, p4/M, z31.h, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x11, #1, MUL VL]\n"
"inch x14\n"
- ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
- ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
- "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x448d43e8 // smlalb z8.s, p4/M, z31.h, z13.h\n"
+ ".inst 0x449e42e0 // smlalb z0.s, p4/M, z23.h, z30.h\n"
+ ".inst 0x454a1021 // ssublb z1.h, z1.b, z10.b\n"
"ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
- ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
- "addvl x26, x26, #2\n"
- ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
- ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
- ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
- ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
- "ld1sb { z31.h }, p3/Z, [x20, x16]\n"
- ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
- ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
- "uzp1 z2.s, z20.s, z15.s\n"
- "inch x16\n"
- ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
- ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
- "uzp2 z15.s, z20.s, z15.s\n"
- "ld1w { z20.s }, p2/Z, [x25]\n"
- ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
- "mov x20, x16\n"
+ ".inst 0x449442f3 // smlalb z19.s, p4/M, z23.h, z20.h\n"
+ ".inst 0x448d47f5 // smlalt z21.s, p4/M, z31.h, z13.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x17]\n"
+ "inch x17\n"
+ ".inst 0x449e46fd // smlalt z29.s, p4/M, z23.h, z30.h\n"
+ ".inst 0x449446e9 // smlalt z9.s, p4/M, z23.h, z20.h\n"
+ "uzp1 z20.s, z24.s, z27.s\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x44924323 // smlalb z3.s, p4/M, z25.h, z18.h\n"
+ ".inst 0x44924730 // smlalt z16.s, p4/M, z25.h, z18.h\n"
+ "uzp2 z24.s, z24.s, z27.s\n"
+ "ld1w { z27.s }, p2/Z, [x10]\n"
+ ".inst 0x448b4328 // smlalb z8.s, p4/M, z25.h, z11.h\n"
+ ".inst 0x448d4180 // smlalb z0.s, p4/M, z12.h, z13.h\n"
+ ".inst 0x454a13ff // ssublb z31.h, z31.b, z10.b\n"
+ "mov x20, x17\n"
+ ".inst 0x44924093 // smlalb z19.s, p4/M, z4.h, z18.h\n"
+ ".inst 0x448b4735 // smlalt z21.s, p4/M, z25.h, z11.h\n"
+ "ld1w { z25.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "whilelt p2.s, x17, x15\n"
+ ".inst 0x448d459d // smlalt z29.s, p4/M, z12.h, z13.h\n"
+ ".inst 0x44924489 // smlalt z9.s, p4/M, z4.h, z18.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x448542e3 // smlalb z3.s, p4/M, z23.h, z5.h\n"
+ ".inst 0x448546f0 // smlalt z16.s, p4/M, z23.h, z5.h\n"
"incw x20\n"
- ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
- ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
- "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
- "uzp1 z21.s, z20.s, z19.s\n"
- ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
- "uzp2 z1.s, z20.s, z19.s\n"
- "whilelt p2.s, x16, x15\n"
- ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
- ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449142e8 // smlalb z8.s, p4/M, z23.h, z17.h\n"
+ ".inst 0x448640e0 // smlalb z0.s, p4/M, z7.h, z6.h\n"
+ "uzp1 z11.s, z27.s, z25.s\n"
+ ".inst 0x449e42d3 // smlalb z19.s, p4/M, z22.h, z30.h\n"
+ ".inst 0x449146f5 // smlalt z21.s, p4/M, z23.h, z17.h\n"
+ "uzp2 z27.s, z27.s, z25.s\n"
+ ".inst 0x448644fd // smlalt z29.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x449e46c9 // smlalt z9.s, p4/M, z22.h, z30.h\n"
"whilelt p1.s, x20, x15\n"
- "whilelt p3.h, x16, x15\n"
- ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
- ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
- ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
- "addvl x25, x25, #2\n"
- ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
- ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
- "and z19.d, z5.d, z21.d\n"
- ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
- ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
- ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
- "sqadd z5.s, z5.s, z19.s\n"
- ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
- ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
- ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
- ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
- ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
- ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
- "and z29.d, z9.d, z1.d\n"
- ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
- ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
- ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
- ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
- ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
- ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
- ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
- ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
- "asr z29.s, z29.s, #0x1f\n"
- "and z18.d, z17.d, z21.d\n"
- ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
- "and z20.d, z16.d, z21.d\n"
- ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
- "and z19.d, z22.d, z21.d\n"
- ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
- "sqadd z9.s, z9.s, z29.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z7.d, z25.d, z1.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z6.d, z23.d, z1.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z27.d, z1.d\n"
- "sqadd z17.s, z17.s, z18.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
- "sqadd z16.s, z16.s, z20.s\n"
+ "whilelt p3.h, x17, x15\n"
+ ".inst 0x44864183 // smlalb z3.s, p4/M, z12.h, z6.h\n"
+ ".inst 0x44864590 // smlalt z16.s, p4/M, z12.h, z6.h\n"
+ ".inst 0x449e4088 // smlalb z8.s, p4/M, z4.h, z30.h\n"
+ ".inst 0x44914020 // smlalb z0.s, p4/M, z1.h, z17.h\n"
+ ".inst 0x449c4033 // smlalb z19.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449e4495 // smlalt z21.s, p4/M, z4.h, z30.h\n"
+ ".inst 0x4491443d // smlalt z29.s, p4/M, z1.h, z17.h\n"
+ ".inst 0x449c4429 // smlalt z9.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c40e3 // smlalb z3.s, p4/M, z7.h, z28.h\n"
+ ".inst 0x449c44f0 // smlalt z16.s, p4/M, z7.h, z28.h\n"
+ ".inst 0x448542c8 // smlalb z8.s, p4/M, z22.h, z5.h\n"
+ ".inst 0x448543e0 // smlalb z0.s, p4/M, z31.h, z5.h\n"
+ ".inst 0x449143f3 // smlalb z19.s, p4/M, z31.h, z17.h\n"
+ ".inst 0x448546d5 // smlalt z21.s, p4/M, z22.h, z5.h\n"
+ ".inst 0x448547fd // smlalt z29.s, p4/M, z31.h, z5.h\n"
+ ".inst 0x449147e9 // smlalt z9.s, p4/M, z31.h, z17.h\n"
+ ".inst 0x04b47463 // sqrdmulh z3.s, z3.s, z20.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ ".inst 0x04b47400 // sqrdmulh z0.s, z0.s, z20.s\n"
+ "and z4.d, z3.d, z11.d\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ "and z13.d, z16.d, z27.d\n"
+ "and z6.d, z8.d, z11.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z7.d, z0.d, z11.d\n"
+ ".inst 0x04b877bd // sqrdmulh z29.s, z29.s, z24.s\n"
+ ".inst 0x04b87529 // sqrdmulh z9.s, z9.s, z24.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
- ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
- "sqadd z22.s, z22.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
- "sqadd z25.s, z25.s, z7.s\n"
- "sqadd z23.s, z23.s, z6.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- "sqadd z27.s, z27.s, z2.s\n"
- ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
- ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
- ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
- ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
- ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
- ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
- ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
- "sqadd z5.h, z5.h, z24.h\n"
- "smax z5.h, p4/M, z5.h, z11.h\n"
- "smin z5.h, p4/M, z5.h, z26.h\n"
- "sqadd z17.h, z17.h, z24.h\n"
- "sqadd z16.h, z16.h, z24.h\n"
- "smax z17.h, p4/M, z17.h, z11.h\n"
- "smax z16.h, p4/M, z16.h, z11.h\n"
- "sqadd z22.h, z22.h, z24.h\n"
- "smax z22.h, p4/M, z22.h, z11.h\n"
- "smin z17.h, p4/M, z17.h, z26.h\n"
- "st1b { z5.h }, p0, [x13, x27]\n"
- "smin z16.h, p4/M, z16.h, z26.h\n"
- "smin z22.h, p4/M, z22.h, z26.h\n"
- "st1b { z17.h }, p0, [x12, x27]\n"
- "st1b { z16.h }, p0, [x11, x27]\n"
- "st1b { z22.h }, p0, [x10, x27]\n"
- "ld1sb { z14.h }, p4/Z, [x14]\n"
- "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
- "inch x27\n"
- "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "sqadd z3.s, z3.s, z4.s\n"
+ "and z20.d, z19.d, z11.d\n"
+ "and z18.d, z21.d, z27.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z13.s\n"
+ "and z13.d, z29.d, z27.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z23.d, z9.d, z27.d\n"
+ ".inst 0x44829163 // srshl z3.s, p4/M, z3.s, z11.s\n"
+ "sqadd z8.s, z8.s, z6.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z7.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
+ ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ ".inst 0x44829168 // srshl z8.s, p4/M, z8.s, z11.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ ".inst 0x45304063 // sqxtnb z3.h, z3.s\n"
+ ".inst 0x44829160 // srshl z0.s, p4/M, z0.s, z11.s\n"
+ "sqadd z29.s, z29.s, z13.s\n"
+ ".inst 0x44829173 // srshl z19.s, p4/M, z19.s, z11.s\n"
+ "sqadd z9.s, z9.s, z23.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x44829375 // srshl z21.s, p4/M, z21.s, z27.s\n"
+ ".inst 0x45304000 // sqxtnb z0.h, z0.s\n"
+ ".inst 0x45304603 // sqxtnt z3.h, z16.s\n"
+ ".inst 0x4482937d // srshl z29.s, p4/M, z29.s, z27.s\n"
+ ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
+ ".inst 0x45304273 // sqxtnb z19.h, z19.s\n"
+ ".inst 0x453046a8 // sqxtnt z8.h, z21.s\n"
+ ".inst 0x453047a0 // sqxtnt z0.h, z29.s\n"
+ ".inst 0x45304533 // sqxtnt z19.h, z9.s\n"
+ "sqadd z3.h, z3.h, z26.h\n"
+ "sqadd z8.h, z8.h, z26.h\n"
+ "sqadd z0.h, z0.h, z26.h\n"
+ "sqadd z19.h, z19.h, z26.h\n"
+ "smax z3.h, p4/M, z3.h, z2.h\n"
+ "smax z8.h, p4/M, z8.h, z2.h\n"
+ "smax z0.h, p4/M, z0.h, z2.h\n"
+ "smax z19.h, p4/M, z19.h, z2.h\n"
+ "smin z3.h, p4/M, z3.h, z14.h\n"
+ "smin z8.h, p4/M, z8.h, z14.h\n"
+ "smin z0.h, p4/M, z0.h, z14.h\n"
+ "smin z19.h, p4/M, z19.h, z14.h\n"
+ "st1b { z3.h }, p0, [x9, x12]\n"
+ "st1b { z8.h }, p0, [x28, x12]\n"
+ "st1b { z0.h }, p0, [x27, x12]\n"
+ "st1b { z19.h }, p0, [x26, x12]\n"
+ "inch x12\n"
+ "ld1sb { z13.h }, p4/Z, [x14]\n"
+ "ld1sb { z11.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #2, MUL VL]\n"
"ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
- ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
- ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
- "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
- "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
- ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
- ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
- "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
- "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "ld1sb { z20.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z30.h }, p4/Z, [x14, #5, MUL VL]\n"
+ "ld1sb { z28.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z17.h }, p4/Z, [x14, #7, MUL VL]\n"
"inch x14, ALL, MUL #8\n"
- ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- "uzp1 z5.s, z17.s, z16.s\n"
- "uzp2 z9.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x14]\n"
- "ldp x24, x23, [x28, #0x0]\n"
+ ".inst 0x454f11ad // ssublb z13.h, z13.b, z15.b\n"
+ "ld1w { z1.s }, p2/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
+ ".inst 0x454f116b // ssublb z11.h, z11.b, z15.b\n"
+ ".inst 0x454f1252 // ssublb z18.h, z18.b, z15.b\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ "ld1sb { z5.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ ".inst 0x454f1294 // ssublb z20.h, z20.b, z15.b\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
+ "uzp1 z3.s, z1.s, z0.s\n"
+ "uzp2 z16.s, z1.s, z0.s\n"
"str x21, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x20]\n"
- "mov z17.d, z5.d\n"
- "mov z25.d, z9.d\n"
- "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
- "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
- "mov z16.d, z5.d\n"
- "mov z23.d, z9.d\n"
- "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
- "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
- "mov z22.d, z5.d\n"
- "mov z27.d, z9.d\n"
- "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
- ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
- ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
- ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
- ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
- ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
- ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
- ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1231 // ssublb z17.h, z17.b, z15.b\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ld1sb { z7.h }, p3/Z, [x24, x17]\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x17]\n"
+ "mov z8.d, z3.d\n"
+ "mov z21.d, z16.d\n"
+ "ld1sb { z1.h }, p3/Z, [x21, x17]\n"
+ "mov z0.d, z3.d\n"
+ "mov z29.d, z16.d\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x17]\n"
+ "mov z19.d, z3.d\n"
+ "mov z9.d, z16.d\n"
+ ".inst 0x454a10e7 // ssublb z7.h, z7.b, z10.b\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x454a1084 // ssublb z4.h, z4.b, z10.b\n"
+ ".inst 0x454a1021 // ssublb z1.h, z1.b, z10.b\n"
+ ".inst 0x454a137b // ssublb z27.h, z27.b, z10.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index fc9a48bb46..f00e1aecaf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const int8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const int8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,348 +100,348 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x7, #0x0\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x8, #0x0\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x7\n"
- "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z26.b }, p4/Z, [x21]\n"
- "ld1rb { z13.b }, p4/Z, [x20]\n"
- "add x21, x25, %[offsetof_Requantize32_minval]\n"
- "add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z19.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x8\n"
+ "add x20, x27, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x27, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x27, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x27, %[offsetof_Requantize32_minval]\n"
+ "add x20, x27, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z12.b }, p4/Z, [x23]\n"
+ "ld1rh { z25.h }, p4/Z, [x22]\n"
+ "ld1rh { z14.h }, p4/Z, [x21]\n"
"ld1rh { z9.h }, p4/Z, [x20]\n"
- "ldp x16, x15, [x24, #0x0]\n"
- "incw x23\n"
- "whilelt p3.h, x7, x8\n"
- "ldp x14, x13, [x24, #0x10]\n"
- "whilelt p2.s, x7, x8\n"
- "whilelt p1.s, x23, x8\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z25.h }, p4/Z, [x17]\n"
- "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
- ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
- "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
- "ld1w { z17.s }, p2/Z, [x12]\n"
- "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z8.s, z17.s, z16.s\n"
- "uzp2 z24.s, z17.s, z16.s\n"
- "ld1sb { z2.h }, p4/Z, [x17]\n"
- "ldp x27, x26, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "mov z18.d, z8.d\n"
- "ldp x25, x24, [x11, #0x10]\n"
- "ldp x23, x22, [x11, #0x20]\n"
- "mov z0.d, z24.d\n"
- "mov z15.d, z8.d\n"
- "ldp x21, x20, [x11, #0x30]\n"
- "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
- "mov z1.d, z24.d\n"
- "mov z5.d, z8.d\n"
- "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
- "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
- "mov z6.d, z24.d\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
- "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
- ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
- "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
- "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
- "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
- ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
- ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
- ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
- ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
- ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "incw x24\n"
+ "whilelt p3.h, x8, x17\n"
+ "ldp x11, x10, [x26, #0x0]\n"
+ "ldp x9, x28, [x26, #0x10]\n"
+ "whilelt p2.s, x8, x17\n"
+ "whilelt p1.s, x24, x17\n"
+ "ld1sb { z28.h }, p4/Z, [x16]\n"
+ "ld1sb { z20.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "ld1sb { z13.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #5, MUL VL]\n"
+ "ld1sb { z26.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1w { z11.s }, p2/Z, [x25]\n"
+ "ld1w { z4.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1252 // ssublb z18.h, z18.b, z12.b\n"
+ "ld1sb { z15.h }, p4/Z, [x16]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "uzp1 z5.s, z11.s, z4.s\n"
+ "uzp2 z11.s, z11.s, z4.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
+ ".inst 0x454c12b5 // ssublb z21.h, z21.b, z12.b\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "mov z30.d, z5.d\n"
+ "mov z16.d, z11.d\n"
+ "mov z4.d, z5.d\n"
+ "mov z8.d, z11.d\n"
+ "mov z31.d, z5.d\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov z10.d, z11.d\n"
+ "ld1sb { z3.h }, p3/Z, [x27, x8]\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x8]\n"
+ "ld1sb { z23.h }, p3/Z, [x25, x8]\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x8]\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x8]\n"
+ "ld1sb { z22.h }, p3/Z, [x22, x8]\n"
+ "ld1sb { z27.h }, p3/Z, [x21, x8]\n"
+ "ld1sb { z19.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x45511273 // ssublb z19.h, z19.b, z17.b\n"
"1:" // Loop
- ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
- "ldr x21, [x11, #0x58]\n"
- "ldr x20, [x11, #0x78]\n"
- ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
- ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
- "ld1sb { z17.h }, p3/Z, [x21, x7]\n"
- "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
- "ldr x21, [x11, #0x60]\n"
- "ldr x20, [x11, #0x80]\n"
- ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
- ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
- ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
- ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
- ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
- "ld1sb { z22.h }, p3/Z, [x21, x7]\n"
- ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
- ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
- ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
- "ldr x21, [x11, #0x68]\n"
- ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
- "ld1sb { z21.h }, p3/Z, [x20, x7]\n"
- "ldr x20, [x11, #0x88]\n"
- ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
- ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
- ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
- ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
- "ldr x22, [x11, #0x40]\n"
- ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
- ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- "ld1sb { z11.h }, p3/Z, [x21, x7]\n"
- ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
- "ld1sb { z20.h }, p3/Z, [x20, x7]\n"
- ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
- "ldr x20, [x11, #0x98]\n"
- ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
- "ldr x23, [x11, #0x50]\n"
- ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
- ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
- ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
- "ld1sb { z17.h }, p3/Z, [x22, x7]\n"
- ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
- ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
- ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
- "ld1sb { z29.h }, p3/Z, [x21, x7]\n"
- "ld1sb { z28.h }, p3/Z, [x20, x7]\n"
- ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
- "ldr x22, [x11, #0x48]\n"
- ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
- ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
- "ldr x21, [x11, #0x90]\n"
- "ldr x20, [x11, #0xa8]\n"
- ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
- "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
- ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
- ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
- ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
- ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
- ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
- "ld1sb { z16.h }, p3/Z, [x22, x7]\n"
- ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
- ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
- "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
- "ld1sb { z11.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
- ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
- "ldr x21, [x11, #0xa0]\n"
- "ldr x20, [x11, #0xb0]\n"
- ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
- ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
- ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
- ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
- ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
- ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
- "ld1sb { z20.h }, p3/Z, [x21, x7]\n"
- ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
- ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
- ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
- ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
- "ldr x20, [x11, #0xb8]\n"
- ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
- ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
- ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
- ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
- "ld1sb { z30.h }, p3/Z, [x20, x7]\n"
- ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
- ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z17.s }, p2/Z, [x9]\n"
- ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
- ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
- "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
- ".inst 0x455a13de // ssublb z30.h, z30.b, z26.b\n"
- ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
- ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
- "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
- "uzp1 z10.s, z17.s, z14.s\n"
- ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
- "uzp2 z14.s, z17.s, z14.s\n"
- "ld1w { z17.s }, p2/Z, [x28]\n"
- ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
- ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
- "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
- ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
- ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
- ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
- "uzp1 z4.s, z17.s, z16.s\n"
- "inch x7\n"
- ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
- ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
- ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
- ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
- "uzp2 z22.s, z17.s, z16.s\n"
- "mov x20, x7\n"
- ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
- ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
- "and z17.d, z8.d, z4.d\n"
- "inch x17\n"
- ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
- ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
- ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
- "incw x20\n"
- ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
- ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- "whilelt p2.s, x7, x8\n"
- ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
- "and z16.d, z24.d, z22.d\n"
- "whilelt p1.s, x20, x8\n"
- ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ ".inst 0x448f4065 // smlalb z5.s, p4/M, z3.h, z15.h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x78]\n"
+ ".inst 0x448f446b // smlalt z11.s, p4/M, z3.h, z15.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x80]\n"
+ ".inst 0x449a407e // smlalb z30.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448d4064 // smlalb z4.s, p4/M, z3.h, z13.h\n"
+ ".inst 0x449c407f // smlalb z31.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449a4470 // smlalt z16.s, p4/M, z3.h, z26.h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "ld1sb { z1.h }, p3/Z, [x25, x8]\n"
+ "ld1sb { z7.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448d4468 // smlalt z8.s, p4/M, z3.h, z13.h\n"
+ ".inst 0x449c446a // smlalt z10.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c43a5 // smlalb z5.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x449c47ab // smlalt z11.s, p4/M, z29.h, z28.h\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x8]\n"
+ "ld1sb { z3.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x4494401e // smlalb z30.s, p4/M, z0.h, z20.h\n"
+ "ldr x25, [x15, #0x40]\n"
+ "ldr x24, [x15, #0x70]\n"
+ "whilelt p0.h, x14, x17\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ ".inst 0x455110e7 // ssublb z7.h, z7.b, z17.b\n"
+ ".inst 0x44944410 // smlalt z16.s, p4/M, z0.h, z20.h\n"
+ "ld1sb { z0.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ "ldr x23, [x15, #0x98]\n"
+ "ldr x22, [x15, #0x50]\n"
+ ".inst 0x449442e5 // smlalb z5.s, p4/M, z23.h, z20.h\n"
+ ".inst 0x449446eb // smlalt z11.s, p4/M, z23.h, z20.h\n"
+ "ld1sb { z23.h }, p3/Z, [x20, x8]\n"
+ "ldr x21, [x15, #0x48]\n"
+ ".inst 0x44924024 // smlalb z4.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448640ff // smlalb z31.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ "ldr x20, [x15, #0x90]\n"
+ ".inst 0x44924428 // smlalt z8.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448644ea // smlalt z10.s, p4/M, z7.h, z6.h\n"
+ "ld1sb { z1.h }, p3/Z, [x25, x8]\n"
+ "ld1sb { z7.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448d431e // smlalb z30.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ ".inst 0x448d4710 // smlalt z16.s, p4/M, z24.h, z13.h\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x449242c5 // smlalb z5.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x449246cb // smlalt z11.s, p4/M, z22.h, z18.h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "ld1sb { z22.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x449c43a4 // smlalb z4.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x4494407f // smlalb z31.s, p4/M, z3.h, z20.h\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ "ldr x23, [x15, #0xa0]\n"
+ ".inst 0x449c47a8 // smlalt z8.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x4494446a // smlalt z10.s, p4/M, z3.h, z20.h\n"
+ ".inst 0x455110e7 // ssublb z7.h, z7.b, z17.b\n"
+ "ldr x22, [x15, #0xb0]\n"
+ ".inst 0x449c427e // smlalb z30.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x449c4670 // smlalt z16.s, p4/M, z19.h, z28.h\n"
+ "ld1sb { z28.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x44864365 // smlalb z5.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x4486476b // smlalt z11.s, p4/M, z27.h, z6.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ ".inst 0x44864004 // smlalb z4.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x448242ff // smlalb z31.s, p4/M, z23.h, z2.h\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ ".inst 0x44864408 // smlalt z8.s, p4/M, z0.h, z6.h\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n"
+ ".inst 0x4486403e // smlalb z30.s, p4/M, z1.h, z6.h\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ "ld1sb { z23.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x44864430 // smlalt z16.s, p4/M, z1.h, z6.h\n"
+ ".inst 0x448d4265 // smlalb z5.s, p4/M, z19.h, z13.h\n"
+ ".inst 0x448d466b // smlalt z11.s, p4/M, z19.h, z13.h\n"
+ "ld1sb { z6.h }, p3/Z, [x22, x8]\n"
+ "ld1sb { z1.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x449440e4 // smlalb z4.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x448d431f // smlalb z31.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ "ld1w { z19.s }, p2/Z, [x13]\n"
+ ".inst 0x449444e8 // smlalt z8.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x448d470a // smlalt z10.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ "ld1w { z20.s }, p1/Z, [x13, #1, MUL VL]\n"
+ ".inst 0x4482439e // smlalb z30.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x455110c6 // ssublb z6.h, z6.b, z17.b\n"
+ ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
+ "ld1sb { z13.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x448242c5 // smlalb z5.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x448246cb // smlalt z11.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x45511021 // ssublb z1.h, z1.b, z17.b\n"
+ "inch x8\n"
+ ".inst 0x449a4364 // smlalb z4.s, p4/M, z27.h, z26.h\n"
+ ".inst 0x4492401f // smlalb z31.s, p4/M, z0.h, z18.h\n"
+ "uzp1 z28.s, z19.s, z20.s\n"
+ "inch x16\n"
+ ".inst 0x449a4768 // smlalt z8.s, p4/M, z27.h, z26.h\n"
+ ".inst 0x4492440a // smlalt z10.s, p4/M, z0.h, z18.h\n"
+ "uzp2 z20.s, z19.s, z20.s\n"
+ "ld1w { z27.s }, p2/Z, [x12]\n"
+ ".inst 0x449242de // smlalb z30.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x449246d0 // smlalt z16.s, p4/M, z22.h, z18.h\n"
+ "ld1w { z19.s }, p1/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x455111ad // ssublb z13.h, z13.b, z17.b\n"
+ ".inst 0x449a43a5 // smlalb z5.s, p4/M, z29.h, z26.h\n"
+ ".inst 0x449a47ab // smlalt z11.s, p4/M, z29.h, z26.h\n"
+ "mov x21, x8\n"
+ "whilelt p2.s, x8, x17\n"
+ ".inst 0x449542e4 // smlalb z4.s, p4/M, z23.h, z21.h\n"
+ ".inst 0x449540df // smlalb z31.s, p4/M, z6.h, z21.h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
- "whilelt p3.h, x7, x8\n"
- "addvl x9, x9, #2\n"
- ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
- "sqadd z8.s, z8.s, z17.s\n"
- ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
- "addvl x28, x28, #2\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z21.d, z18.d, z4.d\n"
- ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
- "and z20.d, z15.d, z4.d\n"
- ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
- "and z28.d, z5.d, z4.d\n"
- ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
- "sqadd z24.s, z24.s, z16.s\n"
- ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z25.d, z0.d, z22.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z17.d, z1.d, z22.d\n"
- "asr z28.s, z28.s, #0x1f\n"
- "and z16.d, z6.d, z22.d\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "asr z25.s, z25.s, #0x1f\n"
- ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
- "sqadd z15.s, z15.s, z20.s\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
- "sqadd z5.s, z5.s, z28.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
- "sqadd z0.s, z0.s, z25.s\n"
- "sqadd z1.s, z1.s, z17.s\n"
- ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
- ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
- "sqadd z6.s, z6.s, z16.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
- ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
- ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
- ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ "addvl x13, x13, #2\n"
+ ".inst 0x449546e8 // smlalt z8.s, p4/M, z23.h, z21.h\n"
+ ".inst 0x449544ca // smlalt z10.s, p4/M, z6.h, z21.h\n"
+ "uzp1 z23.s, z27.s, z19.s\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x4495407e // smlalb z30.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44954470 // smlalt z16.s, p4/M, z3.h, z21.h\n"
+ "uzp2 z6.s, z27.s, z19.s\n"
+ "incw x21\n"
+ ".inst 0x449540e5 // smlalb z5.s, p4/M, z7.h, z21.h\n"
+ ".inst 0x449544eb // smlalt z11.s, p4/M, z7.h, z21.h\n"
+ ".inst 0x44824004 // smlalb z4.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x449a403f // smlalb z31.s, p4/M, z1.h, z26.h\n"
+ ".inst 0x44824408 // smlalt z8.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x449a442a // smlalt z10.s, p4/M, z1.h, z26.h\n"
+ "whilelt p1.s, x21, x17\n"
+ "whilelt p3.h, x8, x17\n"
+ ".inst 0x448f431e // smlalb z30.s, p4/M, z24.h, z15.h\n"
+ ".inst 0x448f4710 // smlalt z16.s, p4/M, z24.h, z15.h\n"
+ ".inst 0x04bc74a5 // sqrdmulh z5.s, z5.s, z28.s\n"
+ ".inst 0x04b4756b // sqrdmulh z11.s, z11.s, z20.s\n"
+ ".inst 0x448f4024 // smlalb z4.s, p4/M, z1.h, z15.h\n"
+ ".inst 0x448f41bf // smlalb z31.s, p4/M, z13.h, z15.h\n"
+ "and z24.d, z5.d, z23.d\n"
+ ".inst 0x448f4428 // smlalt z8.s, p4/M, z1.h, z15.h\n"
+ ".inst 0x448f45aa // smlalt z10.s, p4/M, z13.h, z15.h\n"
+ "and z19.d, z11.d, z6.d\n"
+ ".inst 0x04bc77de // sqrdmulh z30.s, z30.s, z28.s\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ ".inst 0x04bc7484 // sqrdmulh z4.s, z4.s, z28.s\n"
+ ".inst 0x04bc77ff // sqrdmulh z31.s, z31.s, z28.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z7.d, z30.d, z23.d\n"
+ "sqadd z5.s, z5.s, z24.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ "and z15.d, z4.d, z23.d\n"
+ "and z24.d, z31.d, z23.d\n"
+ ".inst 0x04b4754a // sqrdmulh z10.s, z10.s, z20.s\n"
+ "sqadd z11.s, z11.s, z19.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z18.d, z16.d, z6.d\n"
+ ".inst 0x448292e5 // srshl z5.s, p4/M, z5.s, z23.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "and z13.d, z8.d, z6.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "and z3.d, z10.d, z6.d\n"
+ ".inst 0x448290cb // srshl z11.s, p4/M, z11.s, z6.s\n"
+ "sqadd z30.s, z30.s, z7.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z15.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z24.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x448292fe // srshl z30.s, p4/M, z30.s, z23.s\n"
+ "sqadd z16.s, z16.s, z18.s\n"
".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
- ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
- ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
- ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
- ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
- "sqadd z8.h, z8.h, z19.h\n"
- "smax z8.h, p4/M, z8.h, z12.h\n"
- "smin z8.h, p4/M, z8.h, z9.h\n"
- "sqadd z18.h, z18.h, z19.h\n"
- "sqadd z15.h, z15.h, z19.h\n"
- "smax z18.h, p4/M, z18.h, z12.h\n"
- "smax z15.h, p4/M, z15.h, z12.h\n"
- "sqadd z5.h, z5.h, z19.h\n"
- "smax z5.h, p4/M, z5.h, z12.h\n"
- "smin z18.h, p4/M, z18.h, z9.h\n"
- "st1b { z8.h }, p0, [x16, x10]\n"
- "smin z15.h, p4/M, z15.h, z9.h\n"
+ ".inst 0x448292e4 // srshl z4.s, p4/M, z4.s, z23.s\n"
+ "sqadd z8.s, z8.s, z13.s\n"
+ ".inst 0x448292ff // srshl z31.s, p4/M, z31.s, z23.s\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ ".inst 0x453043de // sqxtnb z30.h, z30.s\n"
+ ".inst 0x448290d0 // srshl z16.s, p4/M, z16.s, z6.s\n"
+ ".inst 0x45304084 // sqxtnb z4.h, z4.s\n"
+ ".inst 0x45304565 // sqxtnt z5.h, z11.s\n"
+ ".inst 0x448290c8 // srshl z8.s, p4/M, z8.s, z6.s\n"
+ ".inst 0x448290ca // srshl z10.s, p4/M, z10.s, z6.s\n"
+ ".inst 0x453043ff // sqxtnb z31.h, z31.s\n"
+ ".inst 0x4530461e // sqxtnt z30.h, z16.s\n"
+ ".inst 0x45304504 // sqxtnt z4.h, z8.s\n"
+ ".inst 0x4530455f // sqxtnt z31.h, z10.s\n"
+ "sqadd z5.h, z5.h, z25.h\n"
+ "sqadd z30.h, z30.h, z25.h\n"
+ "sqadd z4.h, z4.h, z25.h\n"
+ "sqadd z31.h, z31.h, z25.h\n"
+ "smax z5.h, p4/M, z5.h, z14.h\n"
+ "smax z30.h, p4/M, z30.h, z14.h\n"
+ "smax z4.h, p4/M, z4.h, z14.h\n"
+ "smax z31.h, p4/M, z31.h, z14.h\n"
"smin z5.h, p4/M, z5.h, z9.h\n"
- "st1b { z18.h }, p0, [x15, x10]\n"
- "st1b { z15.h }, p0, [x14, x10]\n"
- "st1b { z5.h }, p0, [x13, x10]\n"
- "ld1sb { z25.h }, p4/Z, [x17]\n"
- "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
- "inch x10\n"
- "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
- ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
- "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
- "uzp1 z8.s, z17.s, z16.s\n"
- "uzp2 z24.s, z17.s, z16.s\n"
- "ld1sb { z2.h }, p4/Z, [x17]\n"
- "ldp x27, x26, [x11, #0x0]\n"
+ "smin z30.h, p4/M, z30.h, z9.h\n"
+ "smin z4.h, p4/M, z4.h, z9.h\n"
+ "smin z31.h, p4/M, z31.h, z9.h\n"
+ "st1b { z5.h }, p0, [x11, x14]\n"
+ "st1b { z30.h }, p0, [x10, x14]\n"
+ "st1b { z4.h }, p0, [x9, x14]\n"
+ "st1b { z31.h }, p0, [x28, x14]\n"
+ "inch x14\n"
+ "ld1sb { z28.h }, p4/Z, [x16]\n"
+ "ld1sb { z20.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "ld1sb { z13.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #5, MUL VL]\n"
+ "ld1sb { z26.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20, #1, MUL VL]\n"
"addvl x20, x20, #2\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1252 // ssublb z18.h, z18.b, z12.b\n"
+ "ld1sb { z15.h }, p4/Z, [x16]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "uzp1 z5.s, z10.s, z1.s\n"
+ "uzp2 z11.s, z10.s, z1.s\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x25, x24, [x11, #0x10]\n"
- "ldp x23, x22, [x11, #0x20]\n"
- "mov z18.d, z8.d\n"
- "mov z0.d, z24.d\n"
- "ldp x21, x20, [x11, #0x30]\n"
- "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
- "mov z15.d, z8.d\n"
- "mov z1.d, z24.d\n"
- "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
- "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
- "mov z5.d, z8.d\n"
- "mov z6.d, z24.d\n"
- "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
- "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
- "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
- "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
- ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
- ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
- ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
- ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
- ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
- ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
- ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
+ ".inst 0x454c12b5 // ssublb z21.h, z21.b, z12.b\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "mov z30.d, z5.d\n"
+ "mov z16.d, z11.d\n"
+ "mov z4.d, z5.d\n"
+ "mov z8.d, z11.d\n"
+ "mov z31.d, z5.d\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov z10.d, z11.d\n"
+ "ld1sb { z3.h }, p3/Z, [x27, x8]\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x8]\n"
+ "ld1sb { z23.h }, p3/Z, [x25, x8]\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x8]\n"
+ "ld1sb { z24.h }, p3/Z, [x23, x8]\n"
+ "ld1sb { z22.h }, p3/Z, [x22, x8]\n"
+ "ld1sb { z27.h }, p3/Z, [x21, x8]\n"
+ "ld1sb { z19.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511063 // ssublb z3.h, z3.b, z17.b\n"
+ ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n"
+ ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n"
+ ".inst 0x45511000 // ssublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n"
+ ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n"
+ ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n"
+ ".inst 0x45511273 // ssublb z19.h, z19.b, z17.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 7ff724ddd8..726c127d87 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const int8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const int8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -112,533 +112,533 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"mov x2, #0x0\n"
- "mov x24, x2\n"
- "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "incw x24\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_outptrs]]\n"
"ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
- "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z30.b }, p4/Z, [x21]\n"
- "ld1rb { z10.b }, p4/Z, [x20]\n"
- "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x6, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x2\n"
+ "add x20, x27, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x27, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x27, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z14.b }, p4/Z, [x20]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x27, %[offsetof_Requantize32_minval]\n"
+ "add x20, x27, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z12.b }, p4/Z, [x23]\n"
+ "ld1rh { z10.h }, p4/Z, [x22]\n"
+ "incw x24\n"
"ld1rh { z15.h }, p4/Z, [x21]\n"
- "ld1rh { z12.h }, p4/Z, [x20]\n"
- "add x20, x23, %[offsetof_Requantize32_maxval]\n"
"ld1rh { z13.h }, p4/Z, [x20]\n"
- "ldp x5, x6, [x22, #0x0]\n"
"whilelt p3.h, x2, x3\n"
- "ldp x7, x8, [x22, #0x10]\n"
+ "ldp x17, x16, [x26, #0x0]\n"
+ "ldp x15, x14, [x26, #0x10]\n"
"whilelt p2.s, x2, x3\n"
"whilelt p1.s, x24, x3\n"
- "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
- "add x17, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z17.s }, p2/Z, [x10]\n"
- "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
- "uzp1 z14.s, z17.s, z16.s\n"
- "ld1sb { z26.h }, p4/Z, [x4]\n"
- "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
- "uzp2 z23.s, z17.s, z16.s\n"
- "addvl x10, x10, #2\n"
- "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
- "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
- "mov x16, #0x0\n"
- "mov z6.d, z14.d\n"
- "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
- "ldp x9, x28, [x17, #0x0]\n"
- "mov z18.d, z23.d\n"
- "mov z9.d, z14.d\n"
- "ldp x27, x26, [x17, #0x10]\n"
- "ldp x25, x24, [x17, #0x20]\n"
- "mov z20.d, z23.d\n"
- "mov z7.d, z14.d\n"
- "ldp x23, x22, [x17, #0x30]\n"
- "ldp x21, x20, [x17, #0x40]\n"
- "mov z1.d, z23.d\n"
- ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
- "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
- "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
- ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
- ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
- "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
- "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
- "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
- "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
- ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
- "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
- "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
- ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
- "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
- "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
- ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x10, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
- ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
- ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
- ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "ld1w { z5.s }, p2/Z, [x25]\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1sb { z25.h }, p4/Z, [x4]\n"
+ "ld1sb { z28.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z23.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "ld1sb { z31.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "uzp1 z6.s, z5.s, z16.s\n"
+ "uzp2 z30.s, z5.s, z16.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454c1339 // ssublb z25.h, z25.b, z12.b\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c12f7 // ssublb z23.h, z23.b, z12.b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov z17.d, z6.d\n"
+ "mov z8.d, z30.d\n"
+ "mov z21.d, z6.d\n"
+ "mov z27.d, z30.d\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov z7.d, z6.d\n"
+ "mov z9.d, z30.d\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ld1sb { z26.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z16.h }, p3/Z, [x28, x2]\n"
+ "ld1sb { z24.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z5.h }, p3/Z, [x26, x2]\n"
+ "ld1sb { z18.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x24, x2]\n"
+ "ld1sb { z19.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z11.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454e135a // ssublb z26.h, z26.b, z14.b\n"
+ ".inst 0x454e1210 // ssublb z16.h, z16.b, z14.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454e1318 // ssublb z24.h, z24.b, z14.b\n"
+ ".inst 0x454e10a5 // ssublb z5.h, z5.b, z14.b\n"
+ ".inst 0x454e1252 // ssublb z18.h, z18.b, z14.b\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1273 // ssublb z19.h, z19.b, z14.b\n"
+ ".inst 0x454e116b // ssublb z11.h, z11.b, z14.b\n"
+ ".inst 0x454e1294 // ssublb z20.h, z20.b, z14.b\n"
+ ".inst 0x454e13bd // ssublb z29.h, z29.b, z14.b\n"
"1:" // Loop
- ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
- ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
- "ldr x20, [x17, #0x50]\n"
- "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
- ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
- ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
- "ldr x20, [x17, #0x58]\n"
- ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
- ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
- ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
- "ld1sb { z5.h }, p3/Z, [x20, x2]\n"
- "ldr x20, [x17, #0x60]\n"
- ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
- ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
- "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
- ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
- ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
- "ld1sb { z22.h }, p3/Z, [x20, x2]\n"
- ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
- ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
- ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
- "ldr x20, [x17, #0x68]\n"
+ ".inst 0x44994346 // smlalb z6.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x4499475e // smlalt z30.s, p4/M, z26.h, z25.h\n"
+ "ldr x23, [x5, #0x50]\n"
+ "ldr x22, [x5, #0x58]\n"
+ ".inst 0x44994211 // smlalb z17.s, p4/M, z16.h, z25.h\n"
+ ".inst 0x44994315 // smlalb z21.s, p4/M, z24.h, z25.h\n"
+ "ldr x21, [x5, #0x60]\n"
+ "ld1sb { z0.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x449940a7 // smlalb z7.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994608 // smlalt z8.s, p4/M, z16.h, z25.h\n"
+ "ldr x20, [x5, #0x68]\n"
+ "ld1sb { z26.h }, p4/Z, [x4, #6, MUL VL]\n"
+ "ld1sb { z2.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x4499471b // smlalt z27.s, p4/M, z24.h, z25.h\n"
+ ".inst 0x449944a9 // smlalt z9.s, p4/M, z5.h, z25.h\n"
+ "ld1sb { z22.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x449c4206 // smlalb z6.s, p4/M, z16.h, z28.h\n"
+ ".inst 0x449c461e // smlalt z30.s, p4/M, z16.h, z28.h\n"
+ "ld1sb { z1.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x449c4251 // smlalb z17.s, p4/M, z18.h, z28.h\n"
+ ".inst 0x449c40b5 // smlalb z21.s, p4/M, z5.h, z28.h\n"
+ "ld1sb { z16.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
+ ".inst 0x449c4067 // smlalb z7.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n"
+ ".inst 0x449c4648 // smlalt z8.s, p4/M, z18.h, z28.h\n"
+ "ldr x20, [x5, #0x70]\n"
+ ".inst 0x449c44bb // smlalt z27.s, p4/M, z5.h, z28.h\n"
+ ".inst 0x449c4469 // smlalt z9.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x454e12d6 // ssublb z22.h, z22.b, z14.b\n"
+ "ld1sb { z28.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x44844246 // smlalb z6.s, p4/M, z18.h, z4.h\n"
+ ".inst 0x4484465e // smlalt z30.s, p4/M, z18.h, z4.h\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44844271 // smlalb z17.s, p4/M, z19.h, z4.h\n"
+ ".inst 0x44844075 // smlalb z21.s, p4/M, z3.h, z4.h\n"
+ ".inst 0x454e1210 // ssublb z16.h, z16.b, z14.b\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44844047 // smlalb z7.s, p4/M, z2.h, z4.h\n"
+ ".inst 0x44844668 // smlalt z8.s, p4/M, z19.h, z4.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ldr x20, [x5, #0x78]\n"
+ ".inst 0x4484447b // smlalt z27.s, p4/M, z3.h, z4.h\n"
+ ".inst 0x44844449 // smlalt z9.s, p4/M, z2.h, z4.h\n"
+ "ld1sb { z18.h }, p4/Z, [x4]\n"
+ "ldr x22, [x5, #0x80]\n"
+ ".inst 0x44974266 // smlalb z6.s, p4/M, z19.h, z23.h\n"
+ ".inst 0x4497467e // smlalt z30.s, p4/M, z19.h, z23.h\n"
+ ".inst 0x454e1339 // ssublb z25.h, z25.b, z14.b\n"
+ "ld1sb { z4.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44974171 // smlalb z17.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x44974055 // smlalb z21.s, p4/M, z2.h, z23.h\n"
+ "ld1sb { z19.h }, p3/Z, [x20, x2]\n"
+ "ldr x21, [x5, #0x88]\n"
+ ".inst 0x449742c7 // smlalb z7.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44974568 // smlalt z8.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x454c1252 // ssublb z18.h, z18.b, z12.b\n"
+ "ldr x20, [x5, #0x90]\n"
+ ".inst 0x4497445b // smlalt z27.s, p4/M, z2.h, z23.h\n"
+ ".inst 0x449746c9 // smlalt z9.s, p4/M, z22.h, z23.h\n"
+ "ld1sb { z23.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x449f4166 // smlalb z6.s, p4/M, z11.h, z31.h\n"
+ ".inst 0x449f457e // smlalt z30.s, p4/M, z11.h, z31.h\n"
+ ".inst 0x454e1273 // ssublb z19.h, z19.b, z14.b\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x449f4031 // smlalb z17.s, p4/M, z1.h, z31.h\n"
+ ".inst 0x449f42d5 // smlalb z21.s, p4/M, z22.h, z31.h\n"
+ "ldr x23, [x5, #0x98]\n"
+ "ldr x22, [x5, #0xa0]\n"
+ ".inst 0x449f4287 // smlalb z7.s, p4/M, z20.h, z31.h\n"
+ ".inst 0x449f4428 // smlalt z8.s, p4/M, z1.h, z31.h\n"
+ ".inst 0x454e12f7 // ssublb z23.h, z23.b, z14.b\n"
+ "ld1sb { z1.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x449f46db // smlalt z27.s, p4/M, z22.h, z31.h\n"
+ ".inst 0x449f4689 // smlalt z9.s, p4/M, z20.h, z31.h\n"
+ ".inst 0x454c116b // ssublb z11.h, z11.b, z12.b\n"
+ "ld1sb { z31.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x4480471e // smlalt z30.s, p4/M, z24.h, z0.h\n"
+ "ld1sb { z24.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ ".inst 0x448040b1 // smlalb z17.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x448043b5 // smlalb z21.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ "ldr x21, [x5, #0xb0]\n"
+ ".inst 0x44804207 // smlalb z7.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x448044a8 // smlalt z8.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x13, [x5, #0xb8]\n"
+ ".inst 0x448047bb // smlalt z27.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44804609 // smlalt z9.s, p4/M, z16.h, z0.h\n"
+ "ld1sb { z0.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x454e1318 // ssublb z24.h, z24.b, z14.b\n"
+ ".inst 0x449a40a6 // smlalb z6.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a44be // smlalt z30.s, p4/M, z5.h, z26.h\n"
+ "ld1sb { z5.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldr x12, [x5, #0xc0]\n"
+ ".inst 0x449a4071 // smlalb z17.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a4215 // smlalb z21.s, p4/M, z16.h, z26.h\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "ldr x10, [x5, #0xd0]\n"
+ ".inst 0x449a4327 // smlalb z7.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449a4468 // smlalt z8.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n"
+ "ldr x9, [x5, #0xd8]\n"
+ ".inst 0x449a461b // smlalt z27.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4729 // smlalt z9.s, p4/M, z25.h, z26.h\n"
+ "ld1sb { z26.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ ".inst 0x449c4066 // smlalb z6.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c447e // smlalt z30.s, p4/M, z3.h, z28.h\n"
+ "ld1sb { z3.h }, p4/Z, [x4, #5, MUL VL]\n"
+ "ldr x28, [x5, #0xe0]\n"
+ ".inst 0x449c4051 // smlalb z17.s, p4/M, z2.h, z28.h\n"
+ ".inst 0x449c4335 // smlalb z21.s, p4/M, z25.h, z28.h\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "ldr x26, [x5, #0xf0]\n"
+ ".inst 0x449c4267 // smlalb z7.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x449c4448 // smlalt z8.s, p4/M, z2.h, z28.h\n"
+ ".inst 0x454e135a // ssublb z26.h, z26.b, z14.b\n"
+ "ldr x25, [x5, #0xf8]\n"
+ ".inst 0x449c473b // smlalt z27.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4669 // smlalt z9.s, p4/M, z19.h, z28.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x44924046 // smlalb z6.s, p4/M, z2.h, z18.h\n"
+ ".inst 0x4492445e // smlalt z30.s, p4/M, z2.h, z18.h\n"
"ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
- ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
- ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
- "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
- ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
- "ldr x20, [x17, #0x70]\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
- "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
- ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
- ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
- ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "ldr x24, [x5, #0x100]\n"
+ ".inst 0x449242d1 // smlalb z17.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x44924275 // smlalb z21.s, p4/M, z19.h, z18.h\n"
+ "ldr x23, [x5, #0x108]\n"
+ "ldr x22, [x5, #0x110]\n"
+ ".inst 0x449242e7 // smlalb z7.s, p4/M, z23.h, z18.h\n"
+ ".inst 0x449246c8 // smlalt z8.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x454e139c // ssublb z28.h, z28.b, z14.b\n"
+ "ldr x20, [x5, #0x118]\n"
+ ".inst 0x4492467b // smlalt z27.s, p4/M, z19.h, z18.h\n"
+ ".inst 0x449246e9 // smlalt z9.s, p4/M, z23.h, z18.h\n"
+ "ld1sb { z18.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448446de // smlalt z30.s, p4/M, z22.h, z4.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #7, MUL VL]\n"
"inch x4, ALL, MUL #8\n"
- "ld1sb { z8.h }, p3/Z, [x20, x2]\n"
- ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
- ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
- ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
- "ldr x20, [x17, #0x78]\n"
- ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
- ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
- "ld1sb { z24.h }, p4/Z, [x4]\n"
- ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
- ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
- ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
- "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
- ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
- ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
- "ldr x22, [x17, #0x80]\n"
+ ".inst 0x44844291 // smlalb z17.s, p4/M, z20.h, z4.h\n"
+ ".inst 0x448442f5 // smlalb z21.s, p4/M, z23.h, z4.h\n"
+ "whilelt p0.h, x6, x3\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44844027 // smlalb z7.s, p4/M, z1.h, z4.h\n"
+ ".inst 0x44844688 // smlalt z8.s, p4/M, z20.h, z4.h\n"
+ ".inst 0x454e1252 // ssublb z18.h, z18.b, z14.b\n"
+ "ld1sb { z20.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x448446fb // smlalt z27.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44844429 // smlalt z9.s, p4/M, z1.h, z4.h\n"
+ ".inst 0x454c12d6 // ssublb z22.h, z22.b, z12.b\n"
+ "ld1sb { z4.h }, p4/Z, [x4]\n"
+ ".inst 0x448b43a6 // smlalb z6.s, p4/M, z29.h, z11.h\n"
+ ".inst 0x448b47be // smlalt z30.s, p4/M, z29.h, z11.h\n"
+ "ld1sb { z29.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x448b4211 // smlalb z17.s, p4/M, z16.h, z11.h\n"
+ ".inst 0x448b4315 // smlalb z21.s, p4/M, z24.h, z11.h\n"
+ ".inst 0x454e1294 // ssublb z20.h, z20.b, z14.b\n"
+ ".inst 0x448b4007 // smlalb z7.s, p4/M, z0.h, z11.h\n"
+ ".inst 0x448b4608 // smlalt z8.s, p4/M, z16.h, z11.h\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448b471b // smlalt z27.s, p4/M, z24.h, z11.h\n"
+ ".inst 0x448b4409 // smlalt z9.s, p4/M, z0.h, z11.h\n"
+ "ld1sb { z11.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x454e13bd // ssublb z29.h, z29.b, z14.b\n"
+ ".inst 0x449f4206 // smlalb z6.s, p4/M, z16.h, z31.h\n"
+ ".inst 0x449f461e // smlalt z30.s, p4/M, z16.h, z31.h\n"
"ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
- ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
- ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
- ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
- "ldr x21, [x17, #0x88]\n"
- ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
- ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
- ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
- "ldr x20, [x17, #0x90]\n"
- ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
- ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
- "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
- ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
- ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
- "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
- ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
- "ldr x23, [x17, #0x98]\n"
- "ldr x22, [x17, #0xa0]\n"
- ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
- ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
- "ld1sb { z11.h }, p3/Z, [x21, x2]\n"
- ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
- ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
- ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
- "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
- ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
- ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
- ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
- "ld1sb { z17.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
- ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
- ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
- "ldr x20, [x17, #0xa8]\n"
- "ldr x21, [x17, #0xb0]\n"
- ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
- ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
- "ldr x13, [x17, #0xb8]\n"
- "ldr x12, [x17, #0xc0]\n"
- ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
- ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
- "ld1sb { z3.h }, p3/Z, [x23, x2]\n"
- ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
- ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
- ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
- "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
- ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
- ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
- "ldr x11, [x17, #0xc8]\n"
- "ldr x10, [x17, #0xd0]\n"
- ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
- ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
- "ldr x9, [x17, #0xd8]\n"
- "ldr x28, [x17, #0xe0]\n"
- ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
- ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
- "ld1sb { z4.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
- ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
- ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449f4331 // smlalb z17.s, p4/M, z25.h, z31.h\n"
+ ".inst 0x449f4015 // smlalb z21.s, p4/M, z0.h, z31.h\n"
+ ".inst 0x449f4347 // smlalb z7.s, p4/M, z26.h, z31.h\n"
+ ".inst 0x449f4728 // smlalt z8.s, p4/M, z25.h, z31.h\n"
+ ".inst 0x454e116b // ssublb z11.h, z11.b, z14.b\n"
+ ".inst 0x449f441b // smlalt z27.s, p4/M, z0.h, z31.h\n"
+ ".inst 0x449f4749 // smlalt z9.s, p4/M, z26.h, z31.h\n"
+ "ld1sb { z31.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x454c1210 // ssublb z16.h, z16.b, z12.b\n"
+ ".inst 0x44854326 // smlalb z6.s, p4/M, z25.h, z5.h\n"
+ ".inst 0x4485473e // smlalt z30.s, p4/M, z25.h, z5.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x44854271 // smlalb z17.s, p4/M, z19.h, z5.h\n"
+ ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854387 // smlalb z7.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x44854668 // smlalt z8.s, p4/M, z19.h, z5.h\n"
+ ".inst 0x454e13ff // ssublb z31.h, z31.b, z14.b\n"
+ ".inst 0x4485475b // smlalt z27.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
+ "ld1sb { z5.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x454c1339 // ssublb z25.h, z25.b, z12.b\n"
+ ".inst 0x44834266 // smlalb z6.s, p4/M, z19.h, z3.h\n"
+ ".inst 0x4483467e // smlalt z30.s, p4/M, z19.h, z3.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x448342f1 // smlalb z17.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44834395 // smlalb z21.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834247 // smlalb z7.s, p4/M, z18.h, z3.h\n"
+ ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x454e10a5 // ssublb z5.h, z5.b, z14.b\n"
+ ".inst 0x4483479b // smlalt z27.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834649 // smlalt z9.s, p4/M, z18.h, z3.h\n"
+ "ld1sb { z3.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x448242e6 // smlalb z6.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448246fe // smlalt z30.s, p4/M, z23.h, z2.h\n"
+ "ld1sb { z23.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x44824031 // smlalb z17.s, p4/M, z1.h, z2.h\n"
+ ".inst 0x44824255 // smlalb z21.s, p4/M, z18.h, z2.h\n"
+ ".inst 0x44824287 // smlalb z7.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x44824428 // smlalt z8.s, p4/M, z1.h, z2.h\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ "ld1sb { z1.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x4482465b // smlalt z27.s, p4/M, z18.h, z2.h\n"
+ ".inst 0x44824689 // smlalt z9.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x454c12f7 // ssublb z23.h, z23.b, z12.b\n"
"ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
- ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
- ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
- "ldr x27, [x17, #0xe8]\n"
- "ldr x26, [x17, #0xf0]\n"
- ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
- ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
- "ldr x25, [x17, #0xf8]\n"
- "ldr x24, [x17, #0x100]\n"
- ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
- ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
- "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
- ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
- ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
- "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
- ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
- "ldr x23, [x17, #0x108]\n"
- "ldr x22, [x17, #0x110]\n"
- ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
- ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
- "ldr x20, [x17, #0x118]\n"
- "whilelt p0.h, x16, x3\n"
- ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
- ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
- "ld1sb { z5.h }, p3/Z, [x21, x2]\n"
- ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
- ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
- ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
- "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
- "inch x4, ALL, MUL #8\n"
- ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
- ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
- ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
- "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
- ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
- "ld1sb { z28.h }, p3/Z, [x13, x2]\n"
- ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
- ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
- ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
- "ld1sb { z19.h }, p4/Z, [x4]\n"
- ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
- ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
- ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
- "ld1sb { z16.h }, p3/Z, [x12, x2]\n"
- ".inst 0x455e1210 // ssublb z16.h, z16.b, z30.b\n"
- ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
- ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
- ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
- ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
- ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
- ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
- "ld1sb { z26.h }, p3/Z, [x11, x2]\n"
- ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
- ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
- ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
- "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
- ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
- ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
- ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
- ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
- "ld1sb { z8.h }, p3/Z, [x10, x2]\n"
- ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
- ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
- ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
- "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
- ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
- ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
- ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
- ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
- ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
- ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
- ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
- "ld1sb { z31.h }, p3/Z, [x9, x2]\n"
- ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
- ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
- ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
- "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
- ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
- ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
- ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
- ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
- ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
- "ld1sb { z0.h }, p3/Z, [x28, x2]\n"
- ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
- ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
- ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
- "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
- ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
- ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
- ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
- ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
- ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
- "ld1sb { z17.h }, p3/Z, [x27, x2]\n"
- ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
- ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
- ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
- "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
- ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
- ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
- "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
- ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
- ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
- ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
- ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
- ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
- ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
- "ld1sb { z3.h }, p3/Z, [x25, x2]\n"
- ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
- ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
- ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
- "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
- ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
- ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
- ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
- ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
- ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
- ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44964306 // smlalb z6.s, p4/M, z24.h, z22.h\n"
+ ".inst 0x4496471e // smlalt z30.s, p4/M, z24.h, z22.h\n"
+ "ld1sb { z24.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x44964011 // smlalb z17.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x449643b5 // smlalb z21.s, p4/M, z29.h, z22.h\n"
+ ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n"
+ ".inst 0x44964167 // smlalb z7.s, p4/M, z11.h, z22.h\n"
+ ".inst 0x44964408 // smlalt z8.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ ".inst 0x449647bb // smlalt z27.s, p4/M, z29.h, z22.h\n"
+ ".inst 0x44964569 // smlalt z9.s, p4/M, z11.h, z22.h\n"
+ "ld1sb { z22.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x454e1318 // ssublb z24.h, z24.b, z14.b\n"
+ ".inst 0x44844006 // smlalb z6.s, p4/M, z0.h, z4.h\n"
+ ".inst 0x4484441e // smlalt z30.s, p4/M, z0.h, z4.h\n"
+ "ld1sb { z0.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44844351 // smlalb z17.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844175 // smlalb z21.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x448443e7 // smlalb z7.s, p4/M, z31.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x454e12d6 // ssublb z22.h, z22.b, z14.b\n"
+ ".inst 0x4484457b // smlalt z27.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x448447e9 // smlalt z9.s, p4/M, z31.h, z4.h\n"
"ld1sb { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
- ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
- ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
- "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x44904346 // smlalb z6.s, p4/M, z26.h, z16.h\n"
+ ".inst 0x4490475e // smlalt z30.s, p4/M, z26.h, z16.h\n"
+ "ld1sb { z26.h }, p4/Z, [x4, #7, MUL VL]\n"
"inch x4, ALL, MUL #8\n"
- ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
- ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
- ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
- ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
- ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
- ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
- "ld1sb { z27.h }, p3/Z, [x23, x2]\n"
- ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
- ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
- ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
- "ld1sb { z21.h }, p4/Z, [x4]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
- ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
- "inch x4\n"
- ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
- ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
- ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
- ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
- "ld1sb { z5.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
- ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44904391 // smlalb z17.s, p4/M, z28.h, z16.h\n"
+ ".inst 0x449043f5 // smlalb z21.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x449040a7 // smlalb z7.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x44904788 // smlalt z8.s, p4/M, z28.h, z16.h\n"
+ ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n"
+ ".inst 0x449047fb // smlalt z27.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x449044a9 // smlalt z9.s, p4/M, z5.h, z16.h\n"
+ "ld1sb { z16.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
- "ld1w { z22.s }, p2/Z, [x15]\n"
- ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
- ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
- ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
- ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
- "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
- "addvl x15, x15, #2\n"
- ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
- ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
- "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
- ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
- ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
- "uzp1 z25.s, z22.s, z16.s\n"
+ ".inst 0x4499479e // smlalt z30.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z28.h }, p4/Z, [x4]\n"
+ "inch x4\n"
+ ".inst 0x44994251 // smlalb z17.s, p4/M, z18.h, z25.h\n"
+ ".inst 0x449940b5 // smlalb z21.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994067 // smlalb z7.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994648 // smlalt z8.s, p4/M, z18.h, z25.h\n"
+ ".inst 0x454e1210 // ssublb z16.h, z16.b, z14.b\n"
+ ".inst 0x449944bb // smlalt z27.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994469 // smlalt z9.s, p4/M, z3.h, z25.h\n"
+ "ld1sb { z25.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x44934246 // smlalb z6.s, p4/M, z18.h, z19.h\n"
+ ".inst 0x4493465e // smlalt z30.s, p4/M, z18.h, z19.h\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ ".inst 0x44934291 // smlalb z17.s, p4/M, z20.h, z19.h\n"
+ ".inst 0x44934075 // smlalb z21.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x44934027 // smlalb z7.s, p4/M, z1.h, z19.h\n"
+ ".inst 0x44934688 // smlalt z8.s, p4/M, z20.h, z19.h\n"
+ "ld1w { z20.s }, p1/Z, [x7, #1, MUL VL]\n"
+ ".inst 0x454e1339 // ssublb z25.h, z25.b, z14.b\n"
+ ".inst 0x4493447b // smlalt z27.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x44934429 // smlalt z9.s, p4/M, z1.h, z19.h\n"
+ "ld1sb { z19.h }, p3/Z, [x20, x2]\n"
"inch x2\n"
- ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
- ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
- "uzp2 z16.s, z22.s, z16.s\n"
- "ld1w { z22.s }, p2/Z, [x14]\n"
- ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
- ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449743a6 // smlalb z6.s, p4/M, z29.h, z23.h\n"
+ ".inst 0x449747be // smlalt z30.s, p4/M, z29.h, z23.h\n"
+ "addvl x7, x7, #2\n"
+ ".inst 0x44974171 // smlalb z17.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x44974315 // smlalb z21.s, p4/M, z24.h, z23.h\n"
+ "uzp1 z29.s, z18.s, z20.s\n"
+ ".inst 0x449742c7 // smlalb z7.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44974568 // smlalt z8.s, p4/M, z11.h, z23.h\n"
+ "uzp2 z18.s, z18.s, z20.s\n"
+ "ld1w { z20.s }, p2/Z, [x8]\n"
+ ".inst 0x4497471b // smlalt z27.s, p4/M, z24.h, z23.h\n"
+ ".inst 0x449746c9 // smlalt z9.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z24.s }, p1/Z, [x8, #1, MUL VL]\n"
+ ".inst 0x454e1273 // ssublb z19.h, z19.b, z14.b\n"
+ ".inst 0x44824166 // smlalb z6.s, p4/M, z11.h, z2.h\n"
+ ".inst 0x4482457e // smlalt z30.s, p4/M, z11.h, z2.h\n"
"mov x20, x2\n"
- "incw x20\n"
- ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
- "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z29.s, z22.s, z26.s\n"
- ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
- ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
- "uzp2 z22.s, z22.s, z26.s\n"
"whilelt p2.s, x2, x3\n"
- ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
- ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x448243f1 // smlalb z17.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448242d5 // smlalb z21.s, p4/M, z22.h, z2.h\n"
+ "addvl x8, x8, #2\n"
+ ".inst 0x44824087 // smlalb z7.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
+ "uzp1 z23.s, z20.s, z24.s\n"
+ ".inst 0x448246db // smlalt z27.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x44824489 // smlalt z9.s, p4/M, z4.h, z2.h\n"
+ "uzp2 z22.s, z20.s, z24.s\n"
+ "incw x20\n"
+ ".inst 0x448043e6 // smlalb z6.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047fe // smlalt z30.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448040b1 // smlalb z17.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x44804095 // smlalb z21.s, p4/M, z4.h, z0.h\n"
+ ".inst 0x44804207 // smlalb z7.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x448044a8 // smlalt z8.s, p4/M, z5.h, z0.h\n"
"whilelt p1.s, x20, x3\n"
"whilelt p3.h, x2, x3\n"
- ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
- ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
- "addvl x14, x14, #2\n"
- ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
- ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
- ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
- ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
- ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
- ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
- ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
- ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
- ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
- "and z3.d, z14.d, z29.d\n"
- ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
- ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
- "asr z3.s, z3.s, #0x1f\n"
- ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
- ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
- "sqadd z14.s, z14.s, z3.s\n"
- ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
- ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
- ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
- ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
- ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
- ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
- "and z31.d, z23.d, z22.d\n"
- ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
- ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
- ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
- ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
- ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
- ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
- ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
- ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
- ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
- "asr z31.s, z31.s, #0x1f\n"
- "and z3.d, z6.d, z29.d\n"
- ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
- "and z0.d, z9.d, z29.d\n"
- ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
- "and z19.d, z7.d, z29.d\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
- "sqadd z23.s, z23.s, z31.s\n"
- ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "and z21.d, z18.d, z22.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z17.d, z20.d, z22.d\n"
+ ".inst 0x4480449b // smlalt z27.s, p4/M, z4.h, z0.h\n"
+ ".inst 0x44804609 // smlalt z9.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x449a40a6 // smlalb z6.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a44be // smlalt z30.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a4071 // smlalb z17.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a4215 // smlalb z21.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4327 // smlalb z7.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449a4468 // smlalt z8.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a461b // smlalt z27.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4729 // smlalt z9.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449c4066 // smlalb z6.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c447e // smlalt z30.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c4031 // smlalb z17.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c4335 // smlalb z21.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4267 // smlalb z7.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x449c4428 // smlalt z8.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c473b // smlalt z27.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4669 // smlalt z9.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x04bd74c6 // sqrdmulh z6.s, z6.s, z29.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04bd7631 // sqrdmulh z17.s, z17.s, z29.s\n"
+ ".inst 0x04bd76b5 // sqrdmulh z21.s, z21.s, z29.s\n"
+ "and z19.d, z6.d, z23.d\n"
+ ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ "and z16.d, z30.d, z22.d\n"
+ "and z2.d, z17.d, z23.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z20.d, z21.d, z23.d\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z19.s\n"
+ "and z19.d, z7.d, z23.d\n"
+ "and z0.d, z8.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z30.s, z30.s, z16.s\n"
+ "and z26.d, z27.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z16.d, z1.d, z22.d\n"
- "sqadd z6.s, z6.s, z3.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
- "sqadd z9.s, z9.s, z0.s\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "and z16.d, z9.d, z22.d\n"
+ ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
+ "sqadd z17.s, z17.s, z2.s\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z20.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ ".inst 0x448292de // srshl z30.s, p4/M, z30.s, z22.s\n"
"sqadd z7.s, z7.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "sqadd z20.s, z20.s, z17.s\n"
- ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
- ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x448292f1 // srshl z17.s, p4/M, z17.s, z23.s\n"
+ "sqadd z8.s, z8.s, z0.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x448292f5 // srshl z21.s, p4/M, z21.s, z23.s\n"
+ "sqadd z27.s, z27.s, z26.s\n"
+ ".inst 0x448292e7 // srshl z7.s, p4/M, z7.s, z23.s\n"
+ "sqadd z9.s, z9.s, z16.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x448292c8 // srshl z8.s, p4/M, z8.s, z22.s\n"
+ ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
+ ".inst 0x453047c6 // sqxtnt z6.h, z30.s\n"
+ ".inst 0x448292db // srshl z27.s, p4/M, z27.s, z22.s\n"
+ ".inst 0x448292c9 // srshl z9.s, p4/M, z9.s, z22.s\n"
".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
- ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
- ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
- ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
- ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
- "sqadd z14.h, z14.h, z15.h\n"
- "smax z14.h, p4/M, z14.h, z12.h\n"
- "smin z14.h, p4/M, z14.h, z13.h\n"
- "sqadd z6.h, z6.h, z15.h\n"
- "sqadd z9.h, z9.h, z15.h\n"
- "smax z6.h, p4/M, z6.h, z12.h\n"
- "smax z9.h, p4/M, z9.h, z12.h\n"
- "sqadd z7.h, z7.h, z15.h\n"
- "smax z7.h, p4/M, z7.h, z12.h\n"
+ ".inst 0x45304511 // sqxtnt z17.h, z8.s\n"
+ ".inst 0x45304775 // sqxtnt z21.h, z27.s\n"
+ ".inst 0x45304527 // sqxtnt z7.h, z9.s\n"
+ "sqadd z6.h, z6.h, z10.h\n"
+ "sqadd z17.h, z17.h, z10.h\n"
+ "sqadd z21.h, z21.h, z10.h\n"
+ "sqadd z7.h, z7.h, z10.h\n"
+ "smax z6.h, p4/M, z6.h, z15.h\n"
+ "smax z17.h, p4/M, z17.h, z15.h\n"
+ "smax z21.h, p4/M, z21.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z15.h\n"
"smin z6.h, p4/M, z6.h, z13.h\n"
- "st1b { z14.h }, p0, [x5, x16]\n"
- "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z17.h, p4/M, z17.h, z13.h\n"
+ "smin z21.h, p4/M, z21.h, z13.h\n"
"smin z7.h, p4/M, z7.h, z13.h\n"
- "st1b { z6.h }, p0, [x6, x16]\n"
- "st1b { z9.h }, p0, [x7, x16]\n"
- "st1b { z7.h }, p0, [x8, x16]\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
+ "st1b { z6.h }, p0, [x17, x6]\n"
+ "st1b { z17.h }, p0, [x16, x6]\n"
+ "st1b { z21.h }, p0, [x15, x6]\n"
+ "st1b { z7.h }, p0, [x14, x6]\n"
+ "inch x6\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
"ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- "uzp1 z14.s, z17.s, z16.s\n"
- "ld1sb { z26.h }, p4/Z, [x4]\n"
- "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
- "uzp2 z23.s, z17.s, z16.s\n"
"addvl x21, x21, #2\n"
- "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
- "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
- "inch x16\n"
+ "ld1sb { z25.h }, p4/Z, [x4]\n"
+ "ld1sb { z28.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z23.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "ld1sb { z31.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "uzp1 z6.s, z21.s, z16.s\n"
+ "uzp2 z30.s, z21.s, z16.s\n"
"str x21, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
- "ldp x9, x28, [x17, #0x0]\n"
- "mov z6.d, z14.d\n"
- "mov z18.d, z23.d\n"
- "ldp x27, x26, [x17, #0x10]\n"
- "ldp x25, x24, [x17, #0x20]\n"
- "mov z9.d, z14.d\n"
- "mov z20.d, z23.d\n"
- "ldp x23, x22, [x17, #0x30]\n"
- "ldp x21, x20, [x17, #0x40]\n"
- "mov z7.d, z14.d\n"
- "mov z1.d, z23.d\n"
- "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
- "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
- ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
- ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
- "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
- "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
- ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
- "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
- ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
- "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
- "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
- ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
- "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
- "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
- ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
- ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
- ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
- ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
- ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
- ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x454c1339 // ssublb z25.h, z25.b, z12.b\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c12f7 // ssublb z23.h, z23.b, z12.b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov z17.d, z6.d\n"
+ "mov z8.d, z30.d\n"
+ "mov z21.d, z6.d\n"
+ "mov z27.d, z30.d\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov z7.d, z6.d\n"
+ "mov z9.d, z30.d\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ld1sb { z26.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z16.h }, p3/Z, [x28, x2]\n"
+ "ld1sb { z24.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z5.h }, p3/Z, [x26, x2]\n"
+ "ld1sb { z18.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x24, x2]\n"
+ "ld1sb { z19.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z11.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454e135a // ssublb z26.h, z26.b, z14.b\n"
+ ".inst 0x454e1210 // ssublb z16.h, z16.b, z14.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454e1318 // ssublb z24.h, z24.b, z14.b\n"
+ ".inst 0x454e10a5 // ssublb z5.h, z5.b, z14.b\n"
+ ".inst 0x454e1252 // ssublb z18.h, z18.b, z14.b\n"
+ ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1273 // ssublb z19.h, z19.b, z14.b\n"
+ ".inst 0x454e116b // ssublb z11.h, z11.b, z14.b\n"
+ ".inst 0x454e1294 // ssublb z20.h, z20.b, z14.b\n"
+ ".inst 0x454e13bd // ssublb z29.h, z29.b, z14.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index 274b29dcfc..5a9f8e69ad 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,288 +41,288 @@ void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "mov x20, #0x9\n"
- "whilelt p0.b, XZR, x20\n"
- "ldr x23, [%x[inptrs], #0x8]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov x25, #0x9\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "ldr x23, [%x[inptrs], #0x10]\n"
+ "mov z22.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x20]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "mov z13.b, #0x1\n"
- "lsr z13.s, z13.s, #0x8\n"
- "ld1b { z1.b }, p0/Z, [x23]\n"
- "ld1b { z2.b }, p0/Z, [x20]\n"
- "mov z8.d, z1.d\n"
- "mov z27.d, z1.d\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z31.d, z1.d\n"
- "mov z28.d, z2.d\n"
- "ld1b { z0.b }, p0/Z, [x21]\n"
- "mov z30.d, z2.d\n"
- "mov z26.d, z2.d\n"
- "ld1b { z3.b }, p0/Z, [x20]\n"
- "mov z22.d, z4.d\n"
- "mov z10.d, z4.d\n"
+ "lsr z22.s, z22.s, #0x8\n"
+ "mov z29.s, #0x0\n"
"ptrue p2.b\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z18.d, z4.d\n"
- "ext z8.b, z8.b, z8.b, #0x2\n"
+ "whilelt p0.b, XZR, x25\n"
+ "mov z14.s, #0x0\n"
+ "mov z23.s, #0x0\n"
"lsl x10, %x[n_channels], #0x2\n"
- "neg z11.s, p2/M, z11.s\n"
- "ext z27.b, z27.b, z27.b, #0x4\n"
- "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov z11.s, #0x0\n"
+ "mov z15.s, #0x0\n"
"mov x9, #0x0\n"
- "whilelt p0.b, x9, x10\n"
- "ext z28.b, z28.b, z28.b, #0x2\n"
- "ext z30.b, z30.b, z30.b, #0x4\n"
- "ld1w { z14.s }, p0/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "ext z26.b, z26.b, z26.b, #0x6\n"
- "ext z22.b, z22.b, z22.b, #0x2\n"
+ "mov z31.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "ld1b { z2.b }, p0/Z, [x23]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z24.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z27.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "mov z5.d, z1.d\n"
+ "mov z7.d, z1.d\n"
+ "whilelt p0.b, x9, x10\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "ext z10.b, z10.b, z10.b, #0x4\n"
- "ext z18.b, z18.b, z18.b, #0x6\n"
+ "mov z30.d, z1.d\n"
+ "mov z6.d, z2.d\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "mov z21.d, z0.d\n"
- "mov z20.d, z0.d\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "mov z19.d, z0.d\n"
- "mov z24.d, z3.d\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z8.d, z2.d\n"
+ "mov z19.d, z2.d\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z9.d, z4.d\n"
+ "mov z28.d, z4.d\n"
+ "ext z5.b, z5.b, z5.b, #0x2\n"
+ "ext z7.b, z7.b, z7.b, #0x4\n"
+ "ext z30.b, z30.b, z30.b, #0x6\n"
+ "ext z6.b, z6.b, z6.b, #0x2\n"
+ "ext z8.b, z8.b, z8.b, #0x4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "ext z9.b, z9.b, z9.b, #0x2\n"
+ "ext z28.b, z28.b, z28.b, #0x4\n"
+ "zip1 z1.s, z1.s, z7.s\n"
+ "mov z7.d, z4.d\n"
+ "zip1 z5.s, z5.s, z30.s\n"
+ "mov z30.d, z0.d\n"
+ "ext z7.b, z7.b, z7.b, #0x6\n"
+ "zip1 z2.s, z2.s, z8.s\n"
+ "ld1w { z8.s }, p0/Z, [%x[params]]\n"
+ "ext z30.b, z30.b, z30.b, #0x2\n"
+ "zip1 z6.s, z6.s, z19.s\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "zip1 z4.s, z4.s, z28.s\n"
+ "mov z28.d, z0.d\n"
+ "zip1 z9.s, z9.s, z7.s\n"
+ "mov z7.d, z0.d\n"
+ "ext z28.b, z28.b, z28.b, #0x4\n"
+ "zip1 z1.s, z1.s, z5.s\n"
"ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
- "mov z17.d, z3.d\n"
- "mov z16.d, z3.d\n"
+ "ext z7.b, z7.b, z7.b, #0x6\n"
+ "zip1 z2.s, z2.s, z6.s\n"
"ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z4.s, z4.s, z9.s\n"
+ "mov z9.d, z3.d\n"
+ "zip1 z0.s, z0.s, z28.s\n"
+ "mov z28.d, z3.d\n"
+ "ext z9.b, z9.b, z9.b, #0x2\n"
+ "zip1 z30.s, z30.s, z7.s\n"
"ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
- "ext z21.b, z21.b, z21.b, #0x2\n"
- "ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
- "ext z19.b, z19.b, z19.b, #0x6\n"
- "zip1 z1.s, z1.s, z27.s\n"
- "zip1 z8.s, z8.s, z31.s\n"
- "zip1 z2.s, z2.s, z30.s\n"
- "zip1 z28.s, z28.s, z26.s\n"
- "ext z24.b, z24.b, z24.b, #0x2\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "ext z16.b, z16.b, z16.b, #0x6\n"
- "zip1 z4.s, z4.s, z10.s\n"
- "zip1 z22.s, z22.s, z18.s\n"
- "zip1 z0.s, z0.s, z20.s\n"
- "zip1 z21.s, z21.s, z19.s\n"
- "zip1 z1.s, z1.s, z8.s\n"
- "zip1 z2.s, z2.s, z28.s\n"
- "zip1 z3.s, z3.s, z17.s\n"
- "zip1 z24.s, z24.s, z16.s\n"
- "zip1 z4.s, z4.s, z22.s\n"
- "zip1 z0.s, z0.s, z21.s\n"
+ "ext z28.b, z28.b, z28.b, #0x4\n"
"mov z1.q, z1.q[0]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z3.s, z3.s, z24.s\n"
"mov z4.q, z4.q[0]\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "sdot z24.s, z13.b, z1.b[0]\n"
- "mov z23.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "sdot z25.s, z13.b, z1.b[1]\n"
- "mov z21.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "sdot z23.s, z13.b, z1.b[2]\n"
- "mov z10.s, #0x0\n"
- "mov z8.s, #0x0\n"
- "sdot z22.s, z13.b, z1.b[3]\n"
- "mov z20.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "sdot z21.s, z13.b, z2.b[0]\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "sdot z19.s, z13.b, z2.b[1]\n"
- "sdot z10.s, z13.b, z2.b[2]\n"
- "sdot z8.s, z13.b, z2.b[3]\n"
+ "zip1 z0.s, z0.s, z30.s\n"
+ "mov z30.d, z3.d\n"
+ "sdot z25.s, z22.b, z1.b[0]\n"
+ "zip1 z3.s, z3.s, z28.s\n"
+ "sdot z26.s, z22.b, z1.b[1]\n"
+ "sdot z29.s, z22.b, z1.b[2]\n"
+ "ext z30.b, z30.b, z30.b, #0x6\n"
+ "sdot z14.s, z22.b, z1.b[3]\n"
+ "sdot z23.s, z22.b, z2.b[0]\n"
+ "sdot z11.s, z22.b, z2.b[1]\n"
+ "sdot z15.s, z22.b, z2.b[2]\n"
"mov z0.q, z0.q[0]\n"
- "sdot z20.s, z13.b, z4.b[0]\n"
- "sdot z18.s, z13.b, z4.b[1]\n"
- "mov z3.q, z3.q[0]\n"
- "sdot z17.s, z13.b, z4.b[2]\n"
- "sdot z16.s, z13.b, z4.b[3]\n"
- "mov z31.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "sdot z31.s, z13.b, z0.b[0]\n"
- "mov z27.s, #0x0\n"
+ "sdot z31.s, z22.b, z2.b[3]\n"
+ "sdot z17.s, z22.b, z4.b[0]\n"
"mov z28.s, #0x0\n"
- "sdot z30.s, z13.b, z0.b[1]\n"
- "mov z29.s, #0x0\n"
- "sdot z26.s, z13.b, z0.b[2]\n"
- "sdot z27.s, z13.b, z0.b[3]\n"
- "sdot z28.s, z13.b, z3.b[0]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "add z24.s, z24.s, z21.s\n"
- "add z25.s, z25.s, z19.s\n"
- "add z23.s, z23.s, z10.s\n"
- "add z22.s, z22.s, z8.s\n"
- "add z21.s, z20.s, z21.s\n"
+ "zip1 z9.s, z9.s, z30.s\n"
+ "sdot z20.s, z22.b, z4.b[1]\n"
+ "sdot z21.s, z22.b, z4.b[2]\n"
+ "sdot z24.s, z22.b, z4.b[3]\n"
+ "mov z30.s, #0x0\n"
+ "sdot z12.s, z22.b, z0.b[0]\n"
+ "sdot z27.s, z22.b, z0.b[1]\n"
+ "sdot z18.s, z22.b, z0.b[2]\n"
+ "add z25.s, z25.s, z23.s\n"
+ "zip1 z3.s, z3.s, z9.s\n"
+ "mov z9.s, #0x0\n"
+ "sdot z28.s, z22.b, z0.b[3]\n"
+ "add z26.s, z26.s, z11.s\n"
+ "add z29.s, z29.s, z15.s\n"
+ "add z14.s, z14.s, z31.s\n"
+ "add z23.s, z17.s, z23.s\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z17.s, #0x0\n"
+ "add z11.s, z20.s, z11.s\n"
"mov z20.s, #0x0\n"
- "sdot z20.s, z13.b, z3.b[2]\n"
- "add z19.s, z18.s, z19.s\n"
- "mov z18.s, #0x0\n"
- "sdot z18.s, z13.b, z3.b[3]\n"
- "add z17.s, z17.s, z10.s\n"
- "add z16.s, z16.s, z8.s\n"
- "add z24.s, z24.s, z31.s\n"
- "add z25.s, z25.s, z30.s\n"
- "mul z24.s, p2/M, z24.s, z11.s\n"
- "mul z25.s, p2/M, z25.s, z11.s\n"
- "add z26.s, z23.s, z26.s\n"
- "add z27.s, z22.s, z27.s\n"
- "mul z26.s, p2/M, z26.s, z11.s\n"
- "mul z27.s, p2/M, z27.s, z11.s\n"
- "add z28.s, z21.s, z28.s\n"
- "add z29.s, z19.s, z29.s\n"
- "mul z28.s, p2/M, z28.s, z11.s\n"
- "mul z29.s, p2/M, z29.s, z11.s\n"
- "add z30.s, z17.s, z20.s\n"
- "add z31.s, z16.s, z18.s\n"
- "mul z30.s, p2/M, z30.s, z11.s\n"
- "mul z31.s, p2/M, z31.s, z11.s\n"
- "zip1 z19.s, z24.s, z26.s\n"
- "zip1 z18.s, z25.s, z27.s\n"
+ "sdot z30.s, z22.b, z3.b[0]\n"
+ "sdot z9.s, z22.b, z3.b[1]\n"
+ "sdot z17.s, z22.b, z3.b[2]\n"
+ "add z15.s, z21.s, z15.s\n"
+ "sdot z20.s, z22.b, z3.b[3]\n"
+ "add z31.s, z24.s, z31.s\n"
+ "add z24.s, z25.s, z12.s\n"
+ "add z25.s, z26.s, z27.s\n"
+ "add z26.s, z29.s, z18.s\n"
+ "add z27.s, z14.s, z28.s\n"
+ "add z28.s, z23.s, z30.s\n"
+ "add z29.s, z11.s, z9.s\n"
+ "add z30.s, z15.s, z17.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "mul z24.s, p2/M, z24.s, z16.s\n"
+ "mul z25.s, p2/M, z25.s, z16.s\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ "mul z27.s, p2/M, z27.s, z16.s\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "mul z29.s, p2/M, z29.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z16.s\n"
+ "mul z31.s, p2/M, z31.s, z16.s\n"
+ "zip1 z21.s, z24.s, z26.s\n"
+ "add z24.s, z24.s, z8.s\n"
+ "zip1 z23.s, z25.s, z27.s\n"
+ "add z25.s, z25.s, z8.s\n"
+ "add z26.s, z26.s, z8.s\n"
+ "add z27.s, z27.s, z8.s\n"
"zip1 z17.s, z28.s, z30.s\n"
"zip1 z16.s, z29.s, z31.s\n"
- "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z22.s, z21.s, z23.s\n"
+ "add z28.s, z28.s, z8.s\n"
+ "add z29.s, z29.s, z8.s\n"
+ "add z30.s, z30.s, z8.s\n"
"zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
+ "add z31.s, z31.s, z8.s\n"
"1:" // Loop
"sdot z24.s, z5.b, z0.b[0]\n"
"sdot z25.s, z5.b, z0.b[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z15.s }, p2/Z, [%x[params]]\n"
"ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"sdot z26.s, z5.b, z0.b[2]\n"
"sdot z27.s, z5.b, z0.b[3]\n"
"incb x9\n"
"whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z28.s, z5.b, z2.b[0]\n"
+ "sdot z29.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z2.b[2]\n"
+ "sdot z31.s, z5.b, z2.b[3]\n"
"sdot z24.s, z6.b, z1.b[0]\n"
"sdot z25.s, z6.b, z1.b[1]\n"
"whilelt p0.b, x9, x10\n"
- "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
"sdot z26.s, z6.b, z1.b[2]\n"
"sdot z27.s, z6.b, z1.b[3]\n"
- "sdot z28.s, z5.b, z2.b[0]\n"
- "sdot z29.s, z5.b, z2.b[1]\n"
- "sdot z30.s, z5.b, z2.b[2]\n"
- "sdot z31.s, z5.b, z2.b[3]\n"
- "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
- "sdot z24.s, z7.b, z2.b[0]\n"
- "sdot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
- "sdot z26.s, z7.b, z2.b[2]\n"
- "sdot z27.s, z7.b, z2.b[3]\n"
- ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
"sdot z28.s, z6.b, z3.b[0]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
"sdot z30.s, z6.b, z3.b[2]\n"
"sdot z31.s, z6.b, z3.b[3]\n"
- ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
"ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
"sdot z28.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z4.b[1]\n"
- "and z19.d, z24.d, z21.d\n"
"sdot z30.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z4.b[3]\n"
- "and z18.d, z25.d, z21.d\n"
"ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ ".inst 0x04af7718 // sqrdmulh z24.s, z24.s, z15.s\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ ".inst 0x04af775a // sqrdmulh z26.s, z26.s, z15.s\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ ".inst 0x04af779c // sqrdmulh z28.s, z28.s, z15.s\n"
+ ".inst 0x04af77bd // sqrdmulh z29.s, z29.s, z15.s\n"
+ "and z14.d, z24.d, z21.d\n"
+ "and z12.d, z25.d, z21.d\n"
"and z17.d, z26.d, z21.d\n"
"and z16.d, z27.d, z21.d\n"
- "addvl %x[params], %x[params], #6\n"
- "asr z19.s, z19.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04af77de // sqrdmulh z30.s, z30.s, z15.s\n"
+ ".inst 0x04af77ff // sqrdmulh z31.s, z31.s, z15.s\n"
+ "asr z14.s, z14.s, #0x1f\n"
+ "asr z12.s, z12.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
- ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
- ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
- ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
- "sqadd z24.s, z24.s, z19.s\n"
- "sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z24.s, z24.s, z14.s\n"
+ "and z14.d, z28.d, z21.d\n"
+ "sqadd z25.s, z25.s, z12.s\n"
+ "and z11.d, z29.d, z21.d\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
- "and z19.d, z28.d, z21.d\n"
- "and z18.d, z29.d, z21.d\n"
"and z17.d, z30.d, z21.d\n"
"and z16.d, z31.d, z21.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ "asr z14.s, z14.s, #0x1f\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z28.s, z28.s, z19.s\n"
- "sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z24.s, z24.s, z10.s\n"
+ "sqadd z28.s, z28.s, z14.s\n"
+ "sqadd z29.s, z29.s, z11.s\n"
+ "add z25.s, z25.s, z10.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "add z26.s, z26.s, z10.s\n"
+ "add z27.s, z27.s, z10.s\n"
+ "smin z24.s, p2/M, z24.s, z19.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "smin z25.s, p2/M, z25.s, z19.s\n"
".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "add z24.s, z24.s, z9.s\n"
- "add z25.s, z25.s, z9.s\n"
- "smin z24.s, p2/M, z24.s, z12.s\n"
- "smin z25.s, p2/M, z25.s, z12.s\n"
- "add z26.s, z26.s, z9.s\n"
- "add z27.s, z27.s, z9.s\n"
- "smin z26.s, p2/M, z26.s, z12.s\n"
- "smin z27.s, p2/M, z27.s, z12.s\n"
- "add z28.s, z28.s, z9.s\n"
- "add z29.s, z29.s, z9.s\n"
- "smin z28.s, p2/M, z28.s, z12.s\n"
- "smin z29.s, p2/M, z29.s, z12.s\n"
- "add z30.s, z30.s, z9.s\n"
- "add z31.s, z31.s, z9.s\n"
- "smin z30.s, p2/M, z30.s, z12.s\n"
- "smin z31.s, p2/M, z31.s, z12.s\n"
- "smax z24.s, p2/M, z24.s, z15.s\n"
- "smax z25.s, p2/M, z25.s, z15.s\n"
+ "add z28.s, z28.s, z10.s\n"
+ "add z29.s, z29.s, z10.s\n"
+ "smin z26.s, p2/M, z26.s, z19.s\n"
+ "smin z27.s, p2/M, z27.s, z19.s\n"
+ "smax z24.s, p2/M, z24.s, z13.s\n"
+ "add z30.s, z30.s, z10.s\n"
+ "smax z25.s, p2/M, z25.s, z13.s\n"
+ "add z31.s, z31.s, z10.s\n"
+ "smin z28.s, p2/M, z28.s, z19.s\n"
+ "smin z29.s, p2/M, z29.s, z19.s\n"
+ "smax z26.s, p2/M, z26.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z19.s\n"
+ "smax z27.s, p2/M, z27.s, z13.s\n"
"st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z15.s\n"
- "smax z27.s, p2/M, z27.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z19.s\n"
+ "smax z28.s, p2/M, z28.s, z13.s\n"
"st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z15.s\n"
- "smax z29.s, p2/M, z29.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z13.s\n"
"st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z15.s\n"
- "smax z31.s, p2/M, z31.s, z15.s\n"
+ "add z24.s, z24.s, z20.s\n"
+ "smax z30.s, p2/M, z30.s, z13.s\n"
"st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "smax z31.s, p2/M, z31.s, z13.s\n"
"st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z20.s\n"
+ "add z26.s, z26.s, z20.s\n"
"st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z20.s\n"
+ "add z27.s, z27.s, z20.s\n"
"st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
"st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z20.s\n"
- "add z28.s, z28.s, z20.s\n"
"add z29.s, z29.s, z20.s\n"
"add z30.s, z30.s, z20.s\n"
"add z31.s, z31.s, z20.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index a3b2b429c0..7843bfe1be 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,353 +42,353 @@ void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
{
__asm__ __volatile__(
"mov x20, #0x6\n"
- "whilelt p0.b, XZR, x20\n"
- "ldr x22, [%x[inptrs], #0x18]\n"
- "ldr x21, [%x[inptrs], #0x20]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "ld1b { z3.b }, p0/Z, [x22]\n"
- "mov z23.d, z3.d\n"
- "ext z23.b, z23.b, z23.b, #0x1\n"
- "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x27, [%x[inptrs], #0x18]\n"
+ "ldr x26, [%x[inptrs], #0x20]\n"
+ "mov z30.b, #0x1\n"
+ "ldr x25, [%x[inptrs], #0x10]\n"
"ldr x24, [%x[inptrs], #0x8]\n"
- "mov z18.d, z4.d\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z14.s, #0x0\n"
+ "mov z27.s, #0x0\n"
"ldr x23, [%x[inptrs], #0x28]\n"
- "mov z15.d, z2.d\n"
- "ext z15.b, z15.b, z15.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x30]\n"
+ "mov z11.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "whilelt p0.b, XZR, x20\n"
"ldr x21, [%x[inptrs], #0x38]\n"
- "zip1 z3.d, z3.d, z23.d\n"
- "zip1 z4.d, z4.d, z18.d\n"
"ldr x20, [%x[inptrs], #0x0]\n"
+ "mov z28.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z21.s, #0x1\n"
+ "ptrue p2.b\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "mov z24.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ld1b { z3.b }, p0/Z, [x27]\n"
+ "ld1b { z4.b }, p0/Z, [x26]\n"
+ "mov z31.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x25]\n"
"ld1b { z1.b }, p0/Z, [x24]\n"
- "mov z19.d, z1.d\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
+ "mov z20.s, #0x0\n"
+ "mov z17.s, #0x0\n"
"ld1b { z5.b }, p0/Z, [x23]\n"
"ld1b { z6.b }, p0/Z, [x22]\n"
- "mov z18.d, z5.d\n"
- "mov z22.d, z6.d\n"
+ "mov z18.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z16.d, z3.d\n"
+ "mov z13.d, z4.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
"ld1b { z0.b }, p0/Z, [x20]\n"
- "mov z8.d, z7.d\n"
- "zip1 z2.d, z2.d, z15.d\n"
- "mov z3.q, z3.q[0]\n"
- "mov z4.q, z4.q[0]\n"
- "ptrue p2.b\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "ext z22.b, z22.b, z22.b, #0x1\n"
- "lsl x10, %x[n_channels], #0x2\n"
- "neg z23.s, p2/M, z23.s\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "mov z28.b, #0x1\n"
- "mov x9, #0x0\n"
+ "mov z12.d, z2.d\n"
+ "mov z19.d, z1.d\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"whilelt p0.b, x9, x10\n"
- "mov z25.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "sdot z25.s, z28.b, z3.b[0]\n"
- "ld1w { z12.s }, p0/Z, [%x[params]]\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "sdot z24.s, z28.b, z3.b[2]\n"
- "mov x28, #0x0\n"
- "mov z27.d, z0.d\n"
- "sdot z17.s, z28.b, z4.b[0]\n"
- "sdot z16.s, z28.b, z4.b[2]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "ext z27.b, z27.b, z27.b, #0x1\n"
- "zip1 z1.d, z1.d, z19.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "mov z8.d, z5.d\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- "mov z2.q, z2.q[0]\n"
- "zip1 z5.d, z5.d, z18.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z6.d, z6.d, z22.d\n"
- "zip1 z7.d, z7.d, z8.d\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "sdot z30.s, z28.b, z2.b[0]\n"
+ "mov z10.d, z6.d\n"
+ "mov z9.d, z7.d\n"
+ "neg z15.s, p2/M, z15.s\n"
+ "zip1 z3.d, z3.d, z16.d\n"
+ "zip1 z4.d, z4.d, z13.d\n"
+ "ld1w { z13.s }, p0/Z, [%x[params]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "zip1 z2.d, z2.d, z12.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "sdot z14.s, z30.b, z3.b[0]\n"
+ "sdot z27.s, z30.b, z3.b[2]\n"
+ "sdot z11.s, z30.b, z4.b[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "sdot z22.s, z30.b, z4.b[2]\n"
+ "zip1 z5.d, z5.d, z8.d\n"
"ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
- "mov z29.s, #0x1\n"
- "sdot z31.s, z28.b, z2.b[2]\n"
- "sdot z25.s, z29.b, z3.b[1]\n"
+ "zip1 z6.d, z6.d, z10.d\n"
+ "mov z10.d, z0.d\n"
+ "sdot z28.s, z30.b, z2.b[0]\n"
+ "zip1 z7.d, z7.d, z9.d\n"
+ "sdot z25.s, z30.b, z2.b[2]\n"
+ "sdot z14.s, z21.b, z3.b[1]\n"
"ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z0.d, z0.d, z27.d\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
"mov z1.q, z1.q[0]\n"
- "sdot z24.s, z29.b, z3.b[3]\n"
- "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z27.s, z21.b, z3.b[3]\n"
"mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "sdot z17.s, z29.b, z4.b[1]\n"
- "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z11.s, z21.b, z4.b[1]\n"
"mov z7.q, z7.q[0]\n"
- "mov z22.s, #0x0\n"
- "sdot z16.s, z29.b, z4.b[3]\n"
+ "sdot z22.s, z21.b, z4.b[3]\n"
+ "sdot z24.s, z30.b, z1.b[0]\n"
+ "zip1 z0.d, z0.d, z10.d\n"
+ "sdot z23.s, z30.b, z1.b[2]\n"
+ "sdot z31.s, z30.b, z5.b[0]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z29.s, z30.b, z5.b[2]\n"
+ "sdot z20.s, z30.b, z6.b[0]\n"
+ "sdot z17.s, z30.b, z6.b[2]\n"
+ "sdot z18.s, z30.b, z7.b[0]\n"
+ "add z14.s, z14.s, z11.s\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z26.s, z30.b, z7.b[2]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z28.s, z21.b, z2.b[1]\n"
"addvl %x[params], %x[params], #5\n"
- "mov z21.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "sdot z22.s, z28.b, z1.b[0]\n"
+ "sdot z25.s, z21.b, z2.b[3]\n"
+ "add z22.s, z27.s, z22.s\n"
+ "sdot z24.s, z21.b, z1.b[1]\n"
"mov z27.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "sdot z21.s, z28.b, z1.b[2]\n"
- "mov z19.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "sdot z26.s, z28.b, z5.b[0]\n"
- "sdot z27.s, z28.b, z5.b[2]\n"
- "sdot z20.s, z28.b, z6.b[0]\n"
- "mov z0.q, z0.q[0]\n"
- "sdot z19.s, z28.b, z6.b[2]\n"
- "sdot z18.s, z28.b, z7.b[0]\n"
- "add z17.s, z25.s, z17.s\n"
- "mov z25.s, #0x0\n"
- "sdot z25.s, z28.b, z7.b[2]\n"
- "sdot z30.s, z29.b, z2.b[1]\n"
- "sdot z31.s, z29.b, z2.b[3]\n"
- "add z16.s, z24.s, z16.s\n"
- "sdot z22.s, z29.b, z1.b[1]\n"
- "mov z24.s, #0x0\n"
- "sdot z24.s, z28.b, z0.b[0]\n"
- "sdot z21.s, z29.b, z1.b[3]\n"
- "sdot z26.s, z29.b, z5.b[1]\n"
- "sdot z27.s, z29.b, z5.b[3]\n"
- "add z30.s, z30.s, z17.s\n"
- "sdot z20.s, z29.b, z6.b[1]\n"
- "sdot z19.s, z29.b, z6.b[3]\n"
- "add z31.s, z31.s, z16.s\n"
- "sdot z18.s, z29.b, z7.b[1]\n"
- "sdot z25.s, z29.b, z7.b[3]\n"
- "add z22.s, z22.s, z30.s\n"
- "sdot z24.s, z29.b, z0.b[1]\n"
- "add z21.s, z21.s, z31.s\n"
- "add z20.s, z26.s, z20.s\n"
- "add z19.s, z27.s, z19.s\n"
- "add z18.s, z18.s, z17.s\n"
- "mov z17.s, #0x0\n"
- "sdot z17.s, z28.b, z0.b[2]\n"
- "sdot z17.s, z29.b, z0.b[3]\n"
- "add z16.s, z25.s, z16.s\n"
- "add z24.s, z22.s, z24.s\n"
- "add z25.s, z21.s, z17.s\n"
- "mul z24.s, p2/M, z24.s, z23.s\n"
- "mul z25.s, p2/M, z25.s, z23.s\n"
- "add z26.s, z26.s, z22.s\n"
- "add z27.s, z27.s, z21.s\n"
- "mul z26.s, p2/M, z26.s, z23.s\n"
- "mul z27.s, p2/M, z27.s, z23.s\n"
- "add z28.s, z20.s, z30.s\n"
- "add z29.s, z19.s, z31.s\n"
- "mul z28.s, p2/M, z28.s, z23.s\n"
- "mul z29.s, p2/M, z29.s, z23.s\n"
+ "sdot z23.s, z21.b, z1.b[3]\n"
+ "sdot z31.s, z21.b, z5.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[3]\n"
+ "sdot z20.s, z21.b, z6.b[1]\n"
+ "sdot z27.s, z30.b, z0.b[0]\n"
+ "sdot z17.s, z21.b, z6.b[3]\n"
+ "add z28.s, z28.s, z14.s\n"
+ "sdot z18.s, z21.b, z7.b[1]\n"
+ "sdot z26.s, z21.b, z7.b[3]\n"
+ "add z25.s, z25.s, z22.s\n"
+ "add z24.s, z24.s, z28.s\n"
+ "add z20.s, z31.s, z20.s\n"
+ "sdot z27.s, z21.b, z0.b[1]\n"
+ "add z23.s, z23.s, z25.s\n"
+ "add z17.s, z29.s, z17.s\n"
+ "add z18.s, z18.s, z14.s\n"
+ "mov z14.s, #0x0\n"
+ "add z22.s, z26.s, z22.s\n"
+ "add z26.s, z31.s, z24.s\n"
+ "sdot z14.s, z30.b, z0.b[2]\n"
+ "add z24.s, z24.s, z27.s\n"
+ "add z27.s, z29.s, z23.s\n"
+ "add z28.s, z20.s, z28.s\n"
+ "add z29.s, z17.s, z25.s\n"
"add z30.s, z20.s, z18.s\n"
- "add z31.s, z19.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z23.s\n"
- "mul z31.s, p2/M, z31.s, z23.s\n"
- "zip1 z19.s, z24.s, z26.s\n"
- "zip1 z18.s, z25.s, z27.s\n"
- "zip1 z17.s, z28.s, z30.s\n"
- "zip1 z16.s, z29.s, z31.s\n"
- "zip1 z22.s, z19.s, z18.s\n"
- "zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z12.s\n"
- "add z25.s, z25.s, z12.s\n"
- "add z26.s, z26.s, z12.s\n"
- "add z27.s, z27.s, z12.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
+ "add z31.s, z17.s, z22.s\n"
+ "mul z26.s, p2/M, z26.s, z15.s\n"
+ "sdot z14.s, z21.b, z0.b[3]\n"
+ "mul z24.s, p2/M, z24.s, z15.s\n"
+ "mul z27.s, p2/M, z27.s, z15.s\n"
+ "mul z28.s, p2/M, z28.s, z15.s\n"
+ "mul z29.s, p2/M, z29.s, z15.s\n"
+ "mul z30.s, p2/M, z30.s, z15.s\n"
+ "mul z31.s, p2/M, z31.s, z15.s\n"
+ "add z25.s, z23.s, z14.s\n"
+ "zip1 z21.s, z24.s, z26.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "mul z25.s, p2/M, z25.s, z15.s\n"
+ "zip1 z22.s, z28.s, z30.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "zip1 z18.s, z29.s, z31.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "zip1 z14.s, z25.s, z27.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "zip1 z23.s, z22.s, z18.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "zip1 z22.s, z21.s, z14.s\n"
"1:" // Loop
"sdot z24.s, z8.b, z0.b[0]\n"
"sdot z25.s, z8.b, z0.b[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"sdot z26.s, z8.b, z1.b[0]\n"
"sdot z27.s, z8.b, z1.b[2]\n"
"incb x9\n"
"whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z28.s, z8.b, z2.b[0]\n"
+ "sdot z29.s, z8.b, z2.b[2]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z15.b }, p2/Z, [%x[params]]\n"
"sdot z24.s, z9.b, z0.b[1]\n"
"sdot z25.s, z9.b, z0.b[3]\n"
"whilelt p0.b, x9, x10\n"
"sdot z26.s, z9.b, z1.b[1]\n"
"sdot z27.s, z9.b, z1.b[3]\n"
- "sdot z28.s, z8.b, z2.b[0]\n"
- "sdot z29.s, z8.b, z2.b[2]\n"
- "sdot z30.s, z8.b, z3.b[0]\n"
- "sdot z31.s, z8.b, z3.b[2]\n"
- "ld1b { z17.b }, p2/Z, [%x[params]]\n"
- "sdot z24.s, z10.b, z1.b[0]\n"
- "sdot z25.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z27.s, z10.b, z2.b[2]\n"
"sdot z28.s, z9.b, z2.b[1]\n"
"sdot z29.s, z9.b, z2.b[3]\n"
"sdot z30.s, z9.b, z3.b[1]\n"
"sdot z31.s, z9.b, z3.b[3]\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "sdot z24.s, z11.b, z1.b[1]\n"
- "sdot z25.s, z11.b, z1.b[3]\n"
- "sdot z26.s, z11.b, z2.b[1]\n"
- "sdot z27.s, z11.b, z2.b[3]\n"
+ "ld1b { z8.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z24.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
"sdot z28.s, z10.b, z3.b[0]\n"
"sdot z29.s, z10.b, z3.b[2]\n"
"sdot z30.s, z10.b, z4.b[0]\n"
"sdot z31.s, z10.b, z4.b[2]\n"
- "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sdot z24.s, z17.b, z2.b[0]\n"
- "sdot z25.s, z17.b, z2.b[2]\n"
- "sdot z26.s, z17.b, z3.b[0]\n"
- "sdot z27.s, z17.b, z3.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z24.s, z11.b, z1.b[1]\n"
+ "sdot z25.s, z11.b, z1.b[3]\n"
+ "sdot z26.s, z11.b, z2.b[1]\n"
+ "sdot z27.s, z11.b, z2.b[3]\n"
"sdot z28.s, z11.b, z3.b[1]\n"
"sdot z29.s, z11.b, z3.b[3]\n"
"sdot z30.s, z11.b, z4.b[1]\n"
"sdot z31.s, z11.b, z4.b[3]\n"
"ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z24.s, z16.b, z2.b[1]\n"
- "sdot z25.s, z16.b, z2.b[3]\n"
- "sdot z26.s, z16.b, z3.b[1]\n"
- "sdot z27.s, z16.b, z3.b[3]\n"
- "sdot z28.s, z17.b, z4.b[0]\n"
- "sdot z29.s, z17.b, z4.b[2]\n"
- "sdot z30.s, z17.b, z5.b[0]\n"
- "sdot z31.s, z17.b, z5.b[2]\n"
+ "sdot z24.s, z15.b, z2.b[0]\n"
+ "sdot z25.s, z15.b, z2.b[2]\n"
+ "sdot z26.s, z15.b, z3.b[0]\n"
+ "sdot z27.s, z15.b, z3.b[2]\n"
+ "sdot z28.s, z15.b, z4.b[0]\n"
+ "sdot z29.s, z15.b, z4.b[2]\n"
+ "sdot z30.s, z15.b, z5.b[0]\n"
+ "sdot z31.s, z15.b, z5.b[2]\n"
"ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "sdot z24.s, z19.b, z3.b[0]\n"
- "sdot z25.s, z19.b, z3.b[2]\n"
- "sdot z26.s, z19.b, z4.b[0]\n"
- "sdot z27.s, z19.b, z4.b[2]\n"
- "sdot z28.s, z16.b, z4.b[1]\n"
- "sdot z29.s, z16.b, z4.b[3]\n"
- "sdot z30.s, z16.b, z5.b[1]\n"
- "sdot z31.s, z16.b, z5.b[3]\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "sdot z25.s, z8.b, z2.b[3]\n"
+ "sdot z26.s, z8.b, z3.b[1]\n"
+ "sdot z27.s, z8.b, z3.b[3]\n"
+ "sdot z28.s, z8.b, z4.b[1]\n"
+ "sdot z29.s, z8.b, z4.b[3]\n"
+ "sdot z30.s, z8.b, z5.b[1]\n"
+ "sdot z31.s, z8.b, z5.b[3]\n"
+ "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
+ "sdot z24.s, z21.b, z3.b[0]\n"
+ "sdot z25.s, z21.b, z3.b[2]\n"
+ "sdot z26.s, z21.b, z4.b[0]\n"
+ "sdot z27.s, z21.b, z4.b[2]\n"
+ "sdot z28.s, z21.b, z5.b[0]\n"
+ "sdot z29.s, z21.b, z5.b[2]\n"
+ "ld1w { z14.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z30.s, z21.b, z6.b[0]\n"
+ "sdot z31.s, z21.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
"sdot z24.s, z18.b, z3.b[1]\n"
"sdot z25.s, z18.b, z3.b[3]\n"
- "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
"sdot z26.s, z18.b, z4.b[1]\n"
"sdot z27.s, z18.b, z4.b[3]\n"
- "sdot z28.s, z19.b, z5.b[0]\n"
- "sdot z29.s, z19.b, z5.b[2]\n"
- "sdot z30.s, z19.b, z6.b[0]\n"
- "sdot z31.s, z19.b, z6.b[2]\n"
- "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
- "sdot z24.s, z17.b, z4.b[0]\n"
- "sdot z25.s, z17.b, z4.b[2]\n"
- "sdot z26.s, z17.b, z5.b[0]\n"
- "sdot z27.s, z17.b, z5.b[2]\n"
"sdot z28.s, z18.b, z5.b[1]\n"
"sdot z29.s, z18.b, z5.b[3]\n"
"sdot z30.s, z18.b, z6.b[1]\n"
"sdot z31.s, z18.b, z6.b[3]\n"
"ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
- "sdot z24.s, z16.b, z4.b[1]\n"
- "sdot z25.s, z16.b, z4.b[3]\n"
- ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
- "sdot z26.s, z16.b, z5.b[1]\n"
- "sdot z27.s, z16.b, z5.b[3]\n"
- ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "sdot z24.s, z17.b, z4.b[0]\n"
+ "sdot z25.s, z17.b, z4.b[2]\n"
+ "sdot z26.s, z17.b, z5.b[0]\n"
+ "sdot z27.s, z17.b, z5.b[2]\n"
"sdot z28.s, z17.b, z6.b[0]\n"
"sdot z29.s, z17.b, z6.b[2]\n"
- ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
"sdot z30.s, z17.b, z7.b[0]\n"
"sdot z31.s, z17.b, z7.b[2]\n"
- ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
"ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
- "sdot z28.s, z16.b, z6.b[1]\n"
- "sdot z29.s, z16.b, z6.b[3]\n"
- "and z19.d, z24.d, z21.d\n"
- "sdot z30.s, z16.b, z7.b[1]\n"
- "sdot z31.s, z16.b, z7.b[3]\n"
- "and z18.d, z25.d, z21.d\n"
+ "sdot z24.s, z9.b, z4.b[1]\n"
+ "sdot z25.s, z9.b, z4.b[3]\n"
+ "sdot z26.s, z9.b, z5.b[1]\n"
+ "sdot z27.s, z9.b, z5.b[3]\n"
+ "sdot z28.s, z9.b, z6.b[1]\n"
+ "sdot z29.s, z9.b, z6.b[3]\n"
+ "sdot z30.s, z9.b, z7.b[1]\n"
+ "sdot z31.s, z9.b, z7.b[3]\n"
"ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
- "and z17.d, z26.d, z21.d\n"
- "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #-3\n"
- "asr z19.s, z19.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b47718 // sqrdmulh z24.s, z24.s, z20.s\n"
+ ".inst 0x04b47739 // sqrdmulh z25.s, z25.s, z20.s\n"
+ ".inst 0x04b4775a // sqrdmulh z26.s, z26.s, z20.s\n"
+ ".inst 0x04b4777b // sqrdmulh z27.s, z27.s, z20.s\n"
+ ".inst 0x04b4779c // sqrdmulh z28.s, z28.s, z20.s\n"
+ ".inst 0x04b477bd // sqrdmulh z29.s, z29.s, z20.s\n"
+ "and z17.d, z24.d, z13.d\n"
+ "and z18.d, z25.d, z13.d\n"
+ "and z15.d, z26.d, z13.d\n"
+ "and z21.d, z27.d, z13.d\n"
+ ".inst 0x04b477de // sqrdmulh z30.s, z30.s, z20.s\n"
+ ".inst 0x04b477ff // sqrdmulh z31.s, z31.s, z20.s\n"
"asr z17.s, z17.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
- ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
- ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
- ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
- "sqadd z24.s, z24.s, z19.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z17.s\n"
+ "and z20.d, z28.d, z13.d\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
- "sqadd z26.s, z26.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
- "and z19.d, z28.d, z21.d\n"
- "and z18.d, z29.d, z21.d\n"
- "and z17.d, z30.d, z21.d\n"
- "and z16.d, z31.d, z21.d\n"
- "asr z19.s, z19.s, #0x1f\n"
+ "and z18.d, z29.d, z13.d\n"
+ "sqadd z26.s, z26.s, z15.s\n"
+ "sqadd z27.s, z27.s, z21.s\n"
+ "and z17.d, z30.d, z13.d\n"
+ "and z15.d, z31.d, z13.d\n"
+ ".inst 0x448289b8 // srshl z24.s, p2/M, z24.s, z13.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x448289b9 // srshl z25.s, p2/M, z25.s, z13.s\n"
+ ".inst 0x448289ba // srshl z26.s, p2/M, z26.s, z13.s\n"
+ ".inst 0x448289bb // srshl z27.s, p2/M, z27.s, z13.s\n"
"asr z17.s, z17.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z28.s, z28.s, z19.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "add z24.s, z24.s, z16.s\n"
+ "sqadd z28.s, z28.s, z20.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z25.s, z25.s, z16.s\n"
"sqadd z30.s, z30.s, z17.s\n"
- "sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "smin z24.s, p2/M, z24.s, z15.s\n"
- "smin z25.s, p2/M, z25.s, z15.s\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "smin z26.s, p2/M, z26.s, z15.s\n"
- "smin z27.s, p2/M, z27.s, z15.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "smin z28.s, p2/M, z28.s, z15.s\n"
- "smin z29.s, p2/M, z29.s, z15.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
- "smin z30.s, p2/M, z30.s, z15.s\n"
- "smin z31.s, p2/M, z31.s, z15.s\n"
- "smax z24.s, p2/M, z24.s, z14.s\n"
- "smax z25.s, p2/M, z25.s, z14.s\n"
+ "sqadd z31.s, z31.s, z15.s\n"
+ ".inst 0x448289bc // srshl z28.s, p2/M, z28.s, z13.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "smin z24.s, p2/M, z24.s, z19.s\n"
+ ".inst 0x448289bd // srshl z29.s, p2/M, z29.s, z13.s\n"
+ "smin z25.s, p2/M, z25.s, z19.s\n"
+ ".inst 0x448289be // srshl z30.s, p2/M, z30.s, z13.s\n"
+ ".inst 0x448289bf // srshl z31.s, p2/M, z31.s, z13.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "smin z26.s, p2/M, z26.s, z19.s\n"
+ "smin z27.s, p2/M, z27.s, z19.s\n"
+ "smax z24.s, p2/M, z24.s, z12.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "smax z25.s, p2/M, z25.s, z12.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z19.s\n"
+ "smin z29.s, p2/M, z29.s, z19.s\n"
+ "smax z26.s, p2/M, z26.s, z12.s\n"
+ "smin z30.s, p2/M, z30.s, z19.s\n"
+ "smax z27.s, p2/M, z27.s, z12.s\n"
"st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z14.s\n"
- "smax z27.s, p2/M, z27.s, z14.s\n"
+ "smin z31.s, p2/M, z31.s, z19.s\n"
+ "smax z28.s, p2/M, z28.s, z12.s\n"
"st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z14.s\n"
- "smax z29.s, p2/M, z29.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z12.s\n"
"st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z14.s\n"
- "smax z31.s, p2/M, z31.s, z14.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "smax z30.s, p2/M, z30.s, z12.s\n"
"st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
+ "add z25.s, z25.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z12.s\n"
"st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z20.s\n"
+ "add z26.s, z26.s, z14.s\n"
"st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z20.s\n"
+ "add z27.s, z27.s, z14.s\n"
"st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z20.s\n"
+ "add z28.s, z28.s, z14.s\n"
"st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z20.s\n"
- "add z28.s, z28.s, z20.s\n"
- "add z29.s, z29.s, z20.s\n"
- "add z30.s, z30.s, z20.s\n"
- "add z31.s, z31.s, z20.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index d9c8644fc4..0d0f3d76f9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,400 +33,400 @@ namespace depthwise {
void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
{
__asm__ __volatile__(
- "mov x13, #0x0\n"
- "whilelt p0.b, x13, %x[n_channels]\n"
+ "mov x14, #0x0\n"
"ldp x27, x26, [%x[inptrs], #0x0]\n"
"ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ptrue p2.b\n"
"ldp x23, x22, [%x[inptrs], #0x20]\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "ptrue p2.b\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [%x[outptrs], #0x0]\n"
- "ldp x9, x28, [%x[outptrs], #0x10]\n"
- "ld1b { z15.b }, p0/Z, [x27, x13]\n"
- "ld1b { z18.b }, p0/Z, [x26, x13]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z12.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z17.b }, p0/Z, [x26, x14]\n"
"ldp x27, x26, [%x[inptrs], #0x40]\n"
- "ld1b { z16.b }, p0/Z, [x25, x13]\n"
- "zip2 z17.b, z15.b, z16.b\n"
- "zip1 z15.b, z15.b, z16.b\n"
- "ld1b { z14.b }, p0/Z, [x24, x13]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z15.b }, p0/Z, [x24, x14]\n"
"ldp x25, x24, [%x[inptrs], #0x50]\n"
- "zip1 z16.b, z18.b, z14.b\n"
- "zip2 z14.b, z18.b, z14.b\n"
- "ld1b { z13.b }, p0/Z, [x23, x13]\n"
- "ld1b { z18.b }, p0/Z, [x22, x13]\n"
- "zip2 z12.b, z15.b, z16.b\n"
- "zip1 z15.b, z15.b, z16.b\n"
+ "ld1b { z10.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x60]\n"
- "ld1b { z16.b }, p0/Z, [x21, x13]\n"
- "zip1 z11.b, z17.b, z14.b\n"
- "zip2 z14.b, z17.b, z14.b\n"
- "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ "ld1b { z19.b }, p0/Z, [x21, x14]\n"
+ "zip2 z18.b, z12.b, z16.b\n"
+ "zip1 z12.b, z12.b, z16.b\n"
+ "ld1b { z8.b }, p0/Z, [x20, x14]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "zip2 z22.b, z13.b, z16.b\n"
- "zip1 z13.b, z13.b, z16.b\n"
- "ld1b { z9.b }, p0/Z, [x27, x13]\n"
- "ld1b { z17.b }, p0/Z, [x26, x13]\n"
- "zip1 z21.b, z18.b, z10.b\n"
- "zip2 z10.b, z18.b, z10.b\n"
- "ld1b { z16.b }, p0/Z, [x25, x13]\n"
- "ld1b { z8.b }, p0/Z, [x24, x13]\n"
- "zip2 z20.b, z9.b, z16.b\n"
- "zip1 z9.b, z9.b, z16.b\n"
- "ld1b { z7.b }, p0/Z, [x23, x13]\n"
- "ld1b { z19.b }, p0/Z, [x22, x13]\n"
- "zip1 z18.b, z17.b, z8.b\n"
- "zip2 z8.b, z17.b, z8.b\n"
- "ld1b { z16.b }, p0/Z, [x21, x13]\n"
- "ld1b { z6.b }, p0/Z, [x20, x13]\n"
- "zip2 z17.b, z7.b, z16.b\n"
- "zip1 z7.b, z7.b, z16.b\n"
- "zip1 z16.b, z19.b, z6.b\n"
- "zip2 z6.b, z19.b, z6.b\n"
- "ld1w { z5.s }, p2/Z, [%x[params]]\n"
- "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z2.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip2 z1.b, z13.b, z21.b\n"
- "zip1 z13.b, z13.b, z21.b\n"
- "ldp x27, x26, [%x[inptrs], #0x0]\n"
- "ldp x25, x23, [%x[inptrs], #0x10]\n"
- "zip1 z0.b, z22.b, z10.b\n"
- "zip2 z10.b, z22.b, z10.b\n"
+ "zip1 z16.b, z17.b, z15.b\n"
+ "zip2 z15.b, z17.b, z15.b\n"
+ "ld1b { z2.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z17.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z7.b }, p0/Z, [x24, x14]\n"
+ "zip2 z22.b, z10.b, z19.b\n"
+ "zip1 z10.b, z10.b, z19.b\n"
+ "ld1b { z4.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x22, x14]\n"
+ "zip2 z5.b, z12.b, z16.b\n"
+ "zip1 z12.b, z12.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x14]\n"
+ "zip1 z9.b, z18.b, z15.b\n"
+ "zip2 z15.b, z18.b, z15.b\n"
+ "zip1 z20.b, z24.b, z8.b\n"
+ "zip2 z8.b, z24.b, z8.b\n"
+ "ld1w { z13.s }, p2/Z, [%x[params]]\n"
+ "ldp x28, x27, [%x[inptrs], #0x0]\n"
+ "zip2 z19.b, z2.b, z17.b\n"
+ "zip1 z2.b, z2.b, z17.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
"ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip1 z18.b, z23.b, z7.b\n"
+ "zip2 z7.b, z23.b, z7.b\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "zip2 z31.b, z9.b, z18.b\n"
- "zip1 z9.b, z9.b, z18.b\n"
- "zip1 z30.b, z20.b, z8.b\n"
- "zip2 z8.b, z20.b, z8.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "zip2 z27.b, z7.b, z16.b\n"
- "zip1 z7.b, z7.b, z16.b\n"
- "ld1b { z26.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1b { z0.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z17.b, z4.b, z16.b\n"
+ "zip1 z4.b, z4.b, z16.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
"addvl %x[params], %x[params], #4\n"
- "zip1 z25.b, z17.b, z6.b\n"
+ "zip1 z16.b, z21.b, z6.b\n"
+ "zip2 z6.b, z21.b, z6.b\n"
+ "zip2 z31.b, z10.b, z20.b\n"
+ "zip1 z10.b, z10.b, z20.b\n"
+ "zip1 z26.b, z22.b, z8.b\n"
+ "zip2 z8.b, z22.b, z8.b\n"
+ "zip2 z25.b, z2.b, z18.b\n"
+ "zip1 z2.b, z2.b, z18.b\n"
+ "zip1 z28.b, z19.b, z7.b\n"
+ "zip2 z7.b, z19.b, z7.b\n"
+ "zip2 z27.b, z4.b, z16.b\n"
+ "zip1 z4.b, z4.b, z16.b\n"
+ "zip1 z29.b, z17.b, z6.b\n"
"zip2 z6.b, z17.b, z6.b\n"
- "mov z24.d, z5.d\n"
- "mov z22.d, z5.d\n"
- "mov z21.d, z5.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z20.d, z13.d\n"
+ "mov z23.d, z13.d\n"
"1:" // Loop
- "sdot z5.s, z29.b, z15.b\n"
- "sdot z22.s, z29.b, z13.b\n"
- "ext z15.b, z15.b, z15.b, #0x1\n"
- "whilelt p0.s, x12, %x[n_channels]\n"
- "sdot z5.s, z28.b, z13.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "sdot z24.s, z29.b, z15.b\n"
+ "sdot z13.s, z3.b, z12.b\n"
+ "sdot z20.s, z3.b, z10.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z21.s, z3.b, z12.b\n"
"ld1w { z17.s }, p2/Z, [%x[params]]\n"
- "sdot z21.s, z29.b, z13.b\n"
- "sdot z22.s, z28.b, z9.b\n"
- "incw x13, ALL, MUL #4\n"
- "sdot z5.s, z26.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "sdot z24.s, z28.b, z13.b\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "sdot z21.s, z28.b, z9.b\n"
- "sdot z22.s, z26.b, z7.b\n"
- "ext z7.b, z7.b, z7.b, #0x1\n"
- ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
- "sdot z24.s, z26.b, z9.b\n"
- "sdot z21.s, z26.b, z7.b\n"
- "and z16.d, z5.d, z20.d\n"
+ "sdot z13.s, z0.b, z10.b\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "sdot z20.s, z0.b, z2.b\n"
+ "sdot z23.s, z3.b, z10.b\n"
+ "sdot z13.s, z1.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z21.s, z0.b, z10.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z23.s, z0.b, z2.b\n"
+ "sdot z20.s, z1.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ "sdot z21.s, z1.b, z2.b\n"
+ "sdot z23.s, z1.b, z4.b\n"
+ "and z16.d, z13.d, z22.d\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
- ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
- "sqadd z5.s, z5.s, z16.s\n"
- ".inst 0x44828a85 // srshl z5.s, p2/M, z5.s, z20.s\n"
+ ".inst 0x04b176f7 // sqrdmulh z23.s, z23.s, z17.s\n"
"ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "and z18.d, z24.d, z20.d\n"
- "and z17.d, z22.d, z20.d\n"
- "and z16.d, z21.d, z20.d\n"
+ "and z18.d, z20.d, z22.d\n"
+ "sqadd z13.s, z13.s, z16.s\n"
+ "and z17.d, z21.d, z22.d\n"
+ "and z16.d, z23.d, z22.d\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44828acd // srshl z13.s, p2/M, z13.s, z22.s\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "add z5.s, z5.s, z2.s\n"
- ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
- "smax z5.s, p2/M, z5.s, z4.s\n"
- "add z24.s, z24.s, z2.s\n"
- "add z22.s, z22.s, z2.s\n"
- "smin z5.s, p2/M, z5.s, z3.s\n"
- "smax z24.s, p2/M, z24.s, z4.s\n"
- "add z21.s, z21.s, z2.s\n"
- "smax z22.s, p2/M, z22.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z4.s\n"
- "st1b { z5.s }, p0, [x11, x12]\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "smin z24.s, p2/M, z24.s, z3.s\n"
- "smin z22.s, p2/M, z22.s, z3.s\n"
- "smin z21.s, p2/M, z21.s, z3.s\n"
- "st1b { z24.s }, p0, [x10, x12]\n"
- "mov z24.d, z23.d\n"
+ "sqadd z20.s, z20.s, z18.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "sqadd z21.s, z21.s, z17.s\n"
"ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z22.s }, p0, [x9, x12]\n"
- "mov z22.d, z23.d\n"
- "sdot z22.s, z18.b, z1.b\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z21.s }, p0, [x28, x12]\n"
- "mov z21.d, z23.d\n"
- "sdot z23.s, z18.b, z12.b\n"
- "sdot z23.s, z17.b, z1.b\n"
- "ext z12.b, z12.b, z12.b, #0x1\n"
- "ext z1.b, z1.b, z1.b, #0x1\n"
- "sdot z24.s, z18.b, z12.b\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #7, MUL VL]\n"
- "sdot z21.s, z18.b, z1.b\n"
- "sdot z22.s, z17.b, z31.b\n"
- "incw x12\n"
- "whilelt p0.s, x12, %x[n_channels]\n"
- "sdot z23.s, z16.b, z31.b\n"
- "ext z31.b, z31.b, z31.b, #0x1\n"
- "sdot z24.s, z17.b, z1.b\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "add z13.s, z13.s, z30.s\n"
+ ".inst 0x44828ad5 // srshl z21.s, p2/M, z21.s, z22.s\n"
+ ".inst 0x44828ad4 // srshl z20.s, p2/M, z20.s, z22.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "smax z13.s, p2/M, z13.s, z11.s\n"
+ "add z21.s, z21.s, z30.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "add z23.s, z23.s, z30.s\n"
+ "smin z13.s, p2/M, z13.s, z14.s\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smax z20.s, p2/M, z20.s, z11.s\n"
+ "smax z23.s, p2/M, z23.s, z11.s\n"
+ "st1b { z13.s }, p0, [x12, x13]\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "sdot z21.s, z17.b, z31.b\n"
- "sdot z22.s, z16.b, z27.b\n"
+ "smin z21.s, p2/M, z21.s, z14.s\n"
+ "smin z20.s, p2/M, z20.s, z14.s\n"
+ "smin z23.s, p2/M, z23.s, z14.s\n"
+ "st1b { z21.s }, p0, [x11, x13]\n"
+ "mov z13.d, z24.d\n"
+ "st1b { z20.s }, p0, [x10, x13]\n"
+ "mov z21.d, z24.d\n"
+ "st1b { z23.s }, p0, [x9, x13]\n"
+ "mov z20.d, z24.d\n"
+ "sdot z24.s, z16.b, z5.b\n"
+ "incw x13\n"
+ "sdot z21.s, z16.b, z31.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "sdot z24.s, z17.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z13.s, z16.b, z5.b\n"
+ "sdot z20.s, z16.b, z31.b\n"
+ "sdot z21.s, z17.b, z25.b\n"
+ "sdot z24.s, z18.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ "sdot z13.s, z17.b, z31.b\n"
+ "sdot z20.s, z17.b, z25.b\n"
+ "sdot z21.s, z18.b, z27.b\n"
"ext z27.b, z27.b, z27.b, #0x1\n"
- ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
- "sdot z24.s, z16.b, z31.b\n"
- "sdot z21.s, z16.b, z27.b\n"
- "and z16.d, z23.d, z20.d\n"
- "asr z16.s, z16.s, #0x1f\n"
".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ "sdot z13.s, z18.b, z25.b\n"
+ "sdot z20.s, z18.b, z27.b\n"
+ "and z16.d, z24.d, z22.d\n"
".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
"ld1w { z19.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "and z18.d, z24.d, z20.d\n"
- "and z17.d, z22.d, z20.d\n"
- "and z16.d, z21.d, z20.d\n"
+ "and z18.d, z21.d, z22.d\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ "and z17.d, z13.d, z22.d\n"
+ "and z16.d, z20.d, z22.d\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44828ad8 // srshl z24.s, p2/M, z24.s, z22.s\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "add z23.s, z23.s, z2.s\n"
- ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
- "smax z23.s, p2/M, z23.s, z4.s\n"
- "add z24.s, z24.s, z2.s\n"
- "add z22.s, z22.s, z2.s\n"
- "smin z23.s, p2/M, z23.s, z3.s\n"
- "smax z24.s, p2/M, z24.s, z4.s\n"
- "add z21.s, z21.s, z2.s\n"
- "smax z22.s, p2/M, z22.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z4.s\n"
- "st1b { z23.s }, p0, [x11, x12]\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "smin z24.s, p2/M, z24.s, z3.s\n"
- "smin z22.s, p2/M, z22.s, z3.s\n"
- "smin z21.s, p2/M, z21.s, z3.s\n"
- "st1b { z24.s }, p0, [x10, x12]\n"
- "mov z24.d, z23.d\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "sqadd z13.s, z13.s, z17.s\n"
"ld1b { z17.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
- "st1b { z22.s }, p0, [x9, x12]\n"
- "mov z22.d, z23.d\n"
- "sdot z22.s, z18.b, z0.b\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "st1b { z21.s }, p0, [x28, x12]\n"
- "mov z21.d, z23.d\n"
- "sdot z23.s, z18.b, z11.b\n"
- "sdot z23.s, z17.b, z0.b\n"
- "ext z11.b, z11.b, z11.b, #0x1\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "sdot z24.s, z18.b, z11.b\n"
- "ld1w { z20.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "sdot z21.s, z18.b, z0.b\n"
- "sdot z22.s, z17.b, z30.b\n"
- "incw x12\n"
- "whilelt p0.s, x12, %x[n_channels]\n"
- "sdot z23.s, z16.b, z30.b\n"
- "ext z30.b, z30.b, z30.b, #0x1\n"
- "sdot z24.s, z17.b, z0.b\n"
- "sdot z21.s, z17.b, z30.b\n"
- "sdot z22.s, z16.b, z25.b\n"
- "ext z25.b, z25.b, z25.b, #0x1\n"
- ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
- "sdot z24.s, z16.b, z30.b\n"
- "sdot z21.s, z16.b, z25.b\n"
- "and z16.d, z23.d, z20.d\n"
- "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z16.s\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "add z24.s, z24.s, z30.s\n"
+ ".inst 0x44828acd // srshl z13.s, p2/M, z13.s, z22.s\n"
+ ".inst 0x44828ad5 // srshl z21.s, p2/M, z21.s, z22.s\n"
+ ".inst 0x44828ad4 // srshl z20.s, p2/M, z20.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "add z13.s, z13.s, z30.s\n"
+ "add z21.s, z21.s, z30.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "smin z24.s, p2/M, z24.s, z14.s\n"
+ "smax z13.s, p2/M, z13.s, z11.s\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smax z20.s, p2/M, z20.s, z11.s\n"
+ "st1b { z24.s }, p0, [x12, x13]\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "smin z13.s, p2/M, z13.s, z14.s\n"
+ "smin z21.s, p2/M, z21.s, z14.s\n"
+ "smin z20.s, p2/M, z20.s, z14.s\n"
+ "st1b { z13.s }, p0, [x11, x13]\n"
+ "mov z23.d, z24.d\n"
+ "st1b { z21.s }, p0, [x10, x13]\n"
+ "mov z21.d, z24.d\n"
+ "st1b { z20.s }, p0, [x9, x13]\n"
+ "mov z20.d, z24.d\n"
+ "sdot z24.s, z16.b, z9.b\n"
+ "incw x13\n"
+ "sdot z21.s, z16.b, z26.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "sdot z24.s, z17.b, z26.b\n"
+ "ext z26.b, z26.b, z26.b, #0x1\n"
+ "sdot z23.s, z16.b, z9.b\n"
+ "sdot z20.s, z16.b, z26.b\n"
+ "sdot z21.s, z17.b, z28.b\n"
+ "sdot z24.s, z18.b, z28.b\n"
+ "ext z28.b, z28.b, z28.b, #0x1\n"
+ "sdot z23.s, z17.b, z26.b\n"
+ "sdot z20.s, z17.b, z28.b\n"
+ "sdot z21.s, z18.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
- ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ "sdot z23.s, z18.b, z28.b\n"
+ "sdot z20.s, z18.b, z29.b\n"
+ "and z16.d, z24.d, z22.d\n"
".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
"ld1w { z19.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "and z18.d, z24.d, z20.d\n"
- "and z17.d, z22.d, z20.d\n"
- "and z16.d, z21.d, z20.d\n"
+ "and z18.d, z21.d, z22.d\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ "and z17.d, z23.d, z22.d\n"
+ "and z16.d, z20.d, z22.d\n"
"asr z18.s, z18.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44828ad8 // srshl z24.s, p2/M, z24.s, z22.s\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z24.s, z24.s, z18.s\n"
- "sqadd z22.s, z22.s, z17.s\n"
- ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
- ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
- "sqadd z21.s, z21.s, z16.s\n"
- "add z23.s, z23.s, z2.s\n"
- ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
- "smax z23.s, p2/M, z23.s, z4.s\n"
- "add z24.s, z24.s, z2.s\n"
- "add z22.s, z22.s, z2.s\n"
- "smin z23.s, p2/M, z23.s, z3.s\n"
- "smax z24.s, p2/M, z24.s, z4.s\n"
- "add z21.s, z21.s, z2.s\n"
- "smax z22.s, p2/M, z22.s, z4.s\n"
- "smax z21.s, p2/M, z21.s, z4.s\n"
- "st1b { z23.s }, p0, [x11, x12]\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
- "smin z24.s, p2/M, z24.s, z3.s\n"
- "smin z22.s, p2/M, z22.s, z3.s\n"
- "smin z21.s, p2/M, z21.s, z3.s\n"
- "st1b { z24.s }, p0, [x10, x12]\n"
- "mov z29.d, z23.d\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sqadd z23.s, z23.s, z17.s\n"
"ld1b { z17.b }, p2/Z, [%x[params]]\n"
- "st1b { z22.s }, p0, [x9, x12]\n"
- "mov z28.d, z23.d\n"
- "sdot z28.s, z18.b, z10.b\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "st1b { z21.s }, p0, [x28, x12]\n"
- "mov z27.d, z23.d\n"
- "sdot z23.s, z18.b, z14.b\n"
- "sdot z23.s, z17.b, z10.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "ext z10.b, z10.b, z10.b, #0x1\n"
- "sdot z29.s, z18.b, z14.b\n"
+ "sqadd z20.s, z20.s, z16.s\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "add z24.s, z24.s, z30.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ ".inst 0x44828ad5 // srshl z21.s, p2/M, z21.s, z22.s\n"
+ ".inst 0x44828ad4 // srshl z20.s, p2/M, z20.s, z22.s\n"
"ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "sdot z27.s, z18.b, z10.b\n"
- "sdot z28.s, z17.b, z8.b\n"
- "incw x12\n"
- "whilelt p1.s, x12, %x[n_channels]\n"
- "sdot z23.s, z16.b, z8.b\n"
+ "smax z24.s, p2/M, z24.s, z11.s\n"
+ "add z23.s, z23.s, z30.s\n"
+ "add z21.s, z21.s, z30.s\n"
+ "add z20.s, z20.s, z30.s\n"
+ "smin z24.s, p2/M, z24.s, z14.s\n"
+ "smax z23.s, p2/M, z23.s, z11.s\n"
+ "smax z21.s, p2/M, z21.s, z11.s\n"
+ "smax z20.s, p2/M, z20.s, z11.s\n"
+ "st1b { z24.s }, p0, [x12, x13]\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "smin z23.s, p2/M, z23.s, z14.s\n"
+ "smin z21.s, p2/M, z21.s, z14.s\n"
+ "smin z20.s, p2/M, z20.s, z14.s\n"
+ "st1b { z23.s }, p0, [x11, x13]\n"
+ "mov z29.d, z13.d\n"
+ "st1b { z21.s }, p0, [x10, x13]\n"
+ "mov z28.d, z13.d\n"
+ "st1b { z20.s }, p0, [x9, x13]\n"
+ "mov z27.d, z13.d\n"
+ "sdot z13.s, z16.b, z15.b\n"
+ "incw x13\n"
+ "sdot z28.s, z16.b, z8.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "sdot z13.s, z17.b, z8.b\n"
"ext z8.b, z8.b, z8.b, #0x1\n"
- "sdot z29.s, z17.b, z10.b\n"
- "whilelt p0.b, x13, %x[n_channels]\n"
- "sdot z27.s, z17.b, z8.b\n"
- "sdot z28.s, z16.b, z6.b\n"
+ "sdot z29.s, z16.b, z15.b\n"
+ "ld1b { z26.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z15.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x14]\n"
+ "sdot z27.s, z16.b, z8.b\n"
+ "sdot z28.s, z17.b, z7.b\n"
+ "sdot z13.s, z18.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ "sdot z29.s, z17.b, z8.b\n"
+ "ld1b { z8.b }, p0/Z, [x20, x14]\n"
+ "sdot z27.s, z17.b, z7.b\n"
+ "sdot z28.s, z18.b, z6.b\n"
"ext z6.b, z6.b, z6.b, #0x1\n"
- "ld1b { z26.b }, p0/Z, [x26, x13]\n"
- ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
- "sdot z29.s, z16.b, z8.b\n"
- "sdot z27.s, z16.b, z6.b\n"
- "ld1b { z21.b }, p0/Z, [x25, x13]\n"
- "and z16.d, z23.d, z22.d\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ "sdot z29.s, z18.b, z7.b\n"
+ "sdot z27.s, z18.b, z6.b\n"
+ "and z16.d, z13.d, z22.d\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
- "ld1b { z14.b }, p0/Z, [x23, x13]\n"
- "ld1b { z25.b }, p0/Z, [x22, x13]\n"
".inst 0x04b377bd // sqrdmulh z29.s, z29.s, z19.s\n"
- ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
- "ld1b { z20.b }, p0/Z, [x21, x13]\n"
- "ld1b { z10.b }, p0/Z, [x20, x13]\n"
".inst 0x04b3777b // sqrdmulh z27.s, z27.s, z19.s\n"
- "sqadd z23.s, z23.s, z16.s\n"
- ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
- "ld1b { z15.b }, p0/Z, [x27, x13]\n"
- "and z19.d, z29.d, z22.d\n"
- "and z17.d, z28.d, z22.d\n"
+ "ld1b { z12.b }, p0/Z, [x28, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "and z19.d, z28.d, z22.d\n"
"ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "sqadd z13.s, z13.s, z16.s\n"
+ "and z17.d, z29.d, z22.d\n"
"and z16.d, z27.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- "ld1b { z9.b }, p0/Z, [x23, x13]\n"
- "ld1b { z24.b }, p0/Z, [x22, x13]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
"asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x44828acd // srshl z13.s, p2/M, z13.s, z22.s\n"
+ "ld1b { z18.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z7.b }, p0/Z, [x20, x14]\n"
"asr z16.s, z16.s, #0x1f\n"
- "ld1b { z18.b }, p0/Z, [x21, x13]\n"
- "ld1b { z8.b }, p0/Z, [x20, x13]\n"
- "sqadd z29.s, z29.s, z19.s\n"
- "sqadd z28.s, z28.s, z17.s\n"
- ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
- ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sqadd z29.s, z29.s, z17.s\n"
+ "ld1b { z0.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "add z13.s, z13.s, z30.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- "add z23.s, z23.s, z2.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
+ "smax z13.s, p2/M, z13.s, z11.s\n"
".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
- "smax z23.s, p2/M, z23.s, z4.s\n"
- "add z29.s, z29.s, z2.s\n"
- "add z28.s, z28.s, z2.s\n"
- "ld1b { z13.b }, p0/Z, [x24, x13]\n"
+ "ld1b { z10.b }, p0/Z, [x24, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x60]\n"
- "add z27.s, z27.s, z2.s\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "smin z23.s, p2/M, z23.s, z3.s\n"
- "smax z29.s, p2/M, z29.s, z4.s\n"
- "smax z28.s, p2/M, z28.s, z4.s\n"
- "smax z27.s, p2/M, z27.s, z4.s\n"
- "st1b { z23.s }, p1, [x11, x12]\n"
- "ld1b { z7.b }, p0/Z, [x23, x13]\n"
- "ld1b { z23.b }, p0/Z, [x22, x13]\n"
- "ld1b { z22.b }, p0/Z, [x21, x13]\n"
- "zip2 z17.b, z15.b, z21.b\n"
- "zip1 z15.b, z15.b, z21.b\n"
- "ld1b { z6.b }, p0/Z, [x20, x13]\n"
- "zip1 z16.b, z26.b, z14.b\n"
- "zip2 z14.b, z26.b, z14.b\n"
- "smin z29.s, p2/M, z29.s, z3.s\n"
- "smin z28.s, p2/M, z28.s, z3.s\n"
- "smin z27.s, p2/M, z27.s, z3.s\n"
- "st1b { z29.s }, p1, [x10, x12]\n"
- "zip2 z12.b, z15.b, z16.b\n"
- "st1b { z28.s }, p1, [x9, x12]\n"
- "zip1 z15.b, z15.b, z16.b\n"
- "zip1 z11.b, z17.b, z14.b\n"
- "ldp x27, x26, [%x[inptrs], #0x0]\n"
- "st1b { z27.s }, p1, [x28, x12]\n"
- "zip2 z14.b, z17.b, z14.b\n"
- "zip2 z21.b, z13.b, z20.b\n"
- "ld1w { z5.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "zip1 z13.b, z13.b, z20.b\n"
- "zip1 z20.b, z25.b, z10.b\n"
- "incw x12\n"
- "ldp x25, x23, [%x[inptrs], #0x10]\n"
- "zip2 z10.b, z25.b, z10.b\n"
- "zip2 z19.b, z9.b, z18.b\n"
+ "ldp x28, x27, [%x[inptrs], #0x0]\n"
+ "add z29.s, z29.s, z30.s\n"
+ "add z28.s, z28.s, z30.s\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
+ "add z27.s, z27.s, z30.s\n"
+ "smin z13.s, p2/M, z13.s, z14.s\n"
+ "ld1b { z4.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
"ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "smax z29.s, p2/M, z29.s, z11.s\n"
+ "smax z28.s, p2/M, z28.s, z11.s\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x14]\n"
+ "smax z27.s, p2/M, z27.s, z11.s\n"
+ "st1b { z13.s }, p1, [x12, x13]\n"
+ "zip2 z17.b, z12.b, z21.b\n"
+ "zip1 z12.b, z12.b, z21.b\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "zip1 z9.b, z9.b, z18.b\n"
- "zip1 z18.b, z24.b, z8.b\n"
- "ld1b { z29.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "ld1b { z28.b }, p2/Z, [%x[params], #6, MUL VL]\n"
- "zip2 z8.b, z24.b, z8.b\n"
- "zip2 z17.b, z7.b, z22.b\n"
- "ld1b { z26.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "zip1 z16.b, z26.b, z15.b\n"
+ "zip2 z15.b, z26.b, z15.b\n"
+ "smin z29.s, p2/M, z29.s, z14.s\n"
+ "smin z28.s, p2/M, z28.s, z14.s\n"
+ "smin z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z29.s }, p1, [x11, x13]\n"
+ "zip2 z21.b, z10.b, z20.b\n"
+ "zip1 z10.b, z10.b, z20.b\n"
+ "zip1 z20.b, z25.b, z8.b\n"
+ "zip2 z8.b, z25.b, z8.b\n"
+ "st1b { z28.s }, p1, [x10, x13]\n"
+ "zip2 z5.b, z12.b, z16.b\n"
+ "zip1 z12.b, z12.b, z16.b\n"
+ "st1b { z27.s }, p1, [x9, x13]\n"
+ "incw x13\n"
+ "zip1 z9.b, z17.b, z15.b\n"
+ "zip2 z15.b, z17.b, z15.b\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip1 z7.b, z7.b, z22.b\n"
+ "zip2 z19.b, z2.b, z18.b\n"
+ "zip1 z2.b, z2.b, z18.b\n"
+ "zip1 z18.b, z24.b, z7.b\n"
+ "zip2 z7.b, z24.b, z7.b\n"
+ "zip2 z17.b, z4.b, z22.b\n"
+ "zip1 z4.b, z4.b, z22.b\n"
"zip1 z16.b, z23.b, z6.b\n"
"zip2 z6.b, z23.b, z6.b\n"
- "zip2 z1.b, z13.b, z20.b\n"
- "zip1 z13.b, z13.b, z20.b\n"
- "zip1 z0.b, z21.b, z10.b\n"
- "zip2 z10.b, z21.b, z10.b\n"
- "zip2 z31.b, z9.b, z18.b\n"
- "zip1 z9.b, z9.b, z18.b\n"
- "zip1 z30.b, z19.b, z8.b\n"
- "zip2 z8.b, z19.b, z8.b\n"
- "zip2 z27.b, z7.b, z16.b\n"
- "zip1 z7.b, z7.b, z16.b\n"
- "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z31.b, z10.b, z20.b\n"
+ "zip1 z10.b, z10.b, z20.b\n"
+ "zip1 z26.b, z21.b, z8.b\n"
+ "zip2 z8.b, z21.b, z8.b\n"
+ "zip2 z25.b, z2.b, z18.b\n"
+ "zip1 z2.b, z2.b, z18.b\n"
+ "zip1 z28.b, z19.b, z7.b\n"
+ "zip2 z7.b, z19.b, z7.b\n"
+ "zip2 z27.b, z4.b, z16.b\n"
+ "zip1 z4.b, z4.b, z16.b\n"
+ "zip1 z29.b, z17.b, z6.b\n"
"zip2 z6.b, z17.b, z6.b\n"
- "mov z24.d, z5.d\n"
- "mov z22.d, z5.d\n"
- "mov z21.d, z5.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z20.d, z13.d\n"
+ "mov z23.d, z13.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index f0860c98b9..649540ace6 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,456 +34,456 @@ void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_chan
{
__asm__ __volatile__(
"mov x14, #0x0\n"
- "whilelt p0.b, x14, %x[n_channels]\n"
"ldp x27, x26, [%x[inptrs], #0x0]\n"
"ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "mov x28, #0x1\n"
"ldp x23, x22, [%x[inptrs], #0x20]\n"
- "ldp x13, x21, [%x[inptrs], #0x30]\n"
- "mov x20, #0x1\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
"ptrue p2.b\n"
+ "mov x13, #0x0\n"
"ldp x12, x11, [%x[outptrs], #0x0]\n"
"ldp x10, x9, [%x[outptrs], #0x10]\n"
- "orr x20, x20, #0x100\n"
- "orr x20, x20, #0x10000\n"
- "ld1b { z15.b }, p0/Z, [x27, x14]\n"
- "ld1b { z21.b }, p0/Z, [x26, x14]\n"
- "dup z25.s, w20\n"
- "mov x28, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "orr x28, x28, #0x100\n"
+ "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1b { z12.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x26, x14]\n"
+ "orr x28, x28, #0x10000\n"
"ldp x27, x26, [%x[inptrs], #0x40]\n"
- "ld1b { z31.b }, p0/Z, [x25, x14]\n"
- "zip2 z16.b, z15.b, z31.b\n"
- "zip1 z15.b, z15.b, z31.b\n"
- "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z14.b }, p0/Z, [x24, x14]\n"
"ldp x25, x24, [%x[inptrs], #0x50]\n"
- "zip1 z30.b, z21.b, z29.b\n"
- "zip2 z29.b, z21.b, z29.b\n"
- "ld1b { z9.b }, p0/Z, [x23, x14]\n"
- "ld1b { z20.b }, p0/Z, [x22, x14]\n"
- "zip2 z13.b, z15.b, z30.b\n"
- "zip1 z15.b, z15.b, z30.b\n"
+ "ld1b { z5.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z3.b }, p0/Z, [x22, x14]\n"
+ "dup z9.s, w28\n"
"ldp x23, x22, [%x[inptrs], #0x60]\n"
- "ld1b { z5.b }, p0/Z, [x13, x14]\n"
- "zip1 z14.b, z16.b, z29.b\n"
- "zip2 z29.b, z16.b, z29.b\n"
- "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x21, x14]\n"
+ "zip2 z18.b, z12.b, z26.b\n"
+ "zip1 z12.b, z12.b, z26.b\n"
+ "ld1b { z30.b }, p0/Z, [x20, x14]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "zip2 z31.b, z9.b, z5.b\n"
- "zip1 z9.b, z9.b, z5.b\n"
- "ld1b { z18.b }, p0/Z, [x27, x14]\n"
- "ld1b { z28.b }, p0/Z, [x26, x14]\n"
- "zip1 z21.b, z20.b, z17.b\n"
- "zip2 z17.b, z20.b, z17.b\n"
- "ld1b { z6.b }, p0/Z, [x25, x14]\n"
- "ld1b { z4.b }, p0/Z, [x24, x14]\n"
- "zip2 z23.b, z18.b, z6.b\n"
- "zip1 z18.b, z18.b, z6.b\n"
- "ld1b { z2.b }, p0/Z, [x23, x14]\n"
- "ld1b { z19.b }, p0/Z, [x22, x14]\n"
- "zip1 z24.b, z28.b, z4.b\n"
- "zip2 z4.b, z28.b, z4.b\n"
- "ld1b { z16.b }, p0/Z, [x21, x14]\n"
- "ld1b { z5.b }, p0/Z, [x20, x14]\n"
- "zip2 z22.b, z2.b, z16.b\n"
- "zip1 z2.b, z2.b, z16.b\n"
- "zip1 z0.b, z19.b, z5.b\n"
- "zip2 z5.b, z19.b, z5.b\n"
- "ld1w { z10.s }, p2/Z, [%x[params]]\n"
- "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "zip2 z19.b, z9.b, z21.b\n"
- "zip1 z9.b, z9.b, z21.b\n"
- "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ldp x27, x26, [%x[inptrs], #0x0]\n"
- "zip1 z11.b, z31.b, z17.b\n"
- "zip2 z17.b, z31.b, z17.b\n"
- "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z17.b, z24.b, z14.b\n"
+ "zip2 z14.b, z24.b, z14.b\n"
+ "ld1b { z29.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z7.b }, p0/Z, [x24, x14]\n"
+ "zip2 z22.b, z5.b, z19.b\n"
+ "zip1 z5.b, z5.b, z19.b\n"
+ "ld1b { z6.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x22, x14]\n"
+ "zip2 z2.b, z12.b, z17.b\n"
+ "zip1 z12.b, z12.b, z17.b\n"
+ "ld1b { z23.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "zip1 z8.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "zip1 z26.b, z3.b, z30.b\n"
+ "zip2 z30.b, z3.b, z30.b\n"
+ "ld1w { z0.s }, p2/Z, [%x[params]]\n"
+ "ldp x28, x27, [%x[inptrs], #0x0]\n"
+ "zip2 z24.b, z29.b, z16.b\n"
+ "zip1 z29.b, z29.b, z16.b\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
"ldp x24, x22, [%x[inptrs], #0x20]\n"
- "zip2 z12.b, z18.b, z24.b\n"
- "zip1 z18.b, z18.b, z24.b\n"
+ "zip1 z16.b, z25.b, z7.b\n"
+ "zip2 z7.b, z25.b, z7.b\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "zip1 z20.b, z23.b, z4.b\n"
- "zip2 z4.b, z23.b, z4.b\n"
- "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "zip2 z24.b, z2.b, z0.b\n"
- "zip1 z2.b, z2.b, z0.b\n"
- "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "zip1 z0.b, z22.b, z5.b\n"
- "zip2 z5.b, z22.b, z5.b\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z25.b, z6.b, z23.b\n"
+ "zip1 z6.b, z6.b, z23.b\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z19.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
"addvl %x[params], %x[params], #4\n"
- "mov z22.d, z10.d\n"
- "mov z31.d, z10.d\n"
- "mov z21.d, z10.d\n"
+ "zip2 z23.b, z5.b, z26.b\n"
+ "zip1 z5.b, z5.b, z26.b\n"
+ "zip1 z3.b, z22.b, z30.b\n"
+ "zip2 z30.b, z22.b, z30.b\n"
+ "zip2 z11.b, z29.b, z16.b\n"
+ "zip1 z29.b, z29.b, z16.b\n"
+ "zip1 z16.b, z24.b, z7.b\n"
+ "zip2 z7.b, z24.b, z7.b\n"
+ "zip2 z1.b, z6.b, z19.b\n"
+ "zip1 z6.b, z6.b, z19.b\n"
+ "zip1 z27.b, z25.b, z4.b\n"
+ "zip2 z4.b, z25.b, z4.b\n"
+ "mov z26.d, z0.d\n"
+ "mov z25.d, z0.d\n"
+ "mov z28.d, z0.d\n"
"1:" // Loop
- "mov z30.s, #0x0\n"
- "udot z30.s, z25.b, z9.b\n"
- "udot z10.s, z26.b, z15.b\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
- "udot z30.s, z25.b, z18.b\n"
- "udot z31.s, z26.b, z9.b\n"
- "mov z27.s, #0x0\n"
- "incw x14, ALL, MUL #4\n"
- "udot z10.s, z3.b, z9.b\n"
- "ext z9.b, z9.b, z9.b, #0x1\n"
- "movprfx z28, z30\n udot z28.s, z25.b, z2.b\n"
- "udot z30.s, z25.b, z15.b\n"
- "ext z15.b, z15.b, z15.b, #0x1\n"
- "udot z27.s, z25.b, z9.b\n"
- "udot z31.s, z3.b, z18.b\n"
- "udot z10.s, z1.b, z18.b\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "udot z22.s, z26.b, z15.b\n"
- "udot z21.s, z26.b, z9.b\n"
- "udot z27.s, z25.b, z18.b\n"
- "udot z31.s, z1.b, z2.b\n"
- "ext z2.b, z2.b, z2.b, #0x1\n"
- "udot z22.s, z3.b, z9.b\n"
- "udot z21.s, z3.b, z18.b\n"
- "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
- "mls z10.s, p2/M, z30.s, z8.s\n"
- "movprfx z26, z27\n udot z26.s, z25.b, z2.b\n"
- "mov z9.s, #0x0\n"
- "udot z27.s, z25.b, z15.b\n"
- "ld1w { z23.s }, p2/Z, [%x[params]]\n"
- "udot z22.s, z1.b, z18.b\n"
- ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
- "udot z21.s, z1.b, z2.b\n"
- "mls z22.s, p2/M, z27.s, z8.s\n"
- "and z18.d, z10.d, z3.d\n"
- "mls z31.s, p2/M, z28.s, z8.s\n"
- "mls z21.s, p2/M, z26.s, z8.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
- ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
- "udot z9.s, z25.b, z19.b\n"
- ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
- "sqadd z10.s, z10.s, z18.s\n"
- ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
- "udot z9.s, z25.b, z12.b\n"
- "and z28.d, z22.d, z3.d\n"
- "and z23.d, z31.d, z3.d\n"
- "movprfx z27, z9\n udot z27.s, z25.b, z24.b\n"
- "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "and z18.d, z21.d, z3.d\n"
- "asr z28.s, z28.s, #0x1f\n"
- "udot z9.s, z25.b, z13.b\n"
- "asr z23.s, z23.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z22.s, z22.s, z28.s\n"
- "sqadd z31.s, z31.s, z23.s\n"
- ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
- ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
- "sqadd z21.s, z21.s, z18.s\n"
- "add z10.s, z10.s, z16.s\n"
- ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
- "smax z10.s, p2/M, z10.s, z7.s\n"
- "add z22.s, z22.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
- "smin z10.s, p2/M, z10.s, z6.s\n"
- "smax z22.s, p2/M, z22.s, z7.s\n"
- "add z21.s, z21.s, z16.s\n"
- "smax z31.s, p2/M, z31.s, z7.s\n"
- "smax z21.s, p2/M, z21.s, z7.s\n"
- "st1b { z10.s }, p0, [x12, x28]\n"
- "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z21.s, p2/M, z21.s, z6.s\n"
- "st1b { z22.s }, p0, [x11, x28]\n"
- "mov z26.d, z28.d\n"
- "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z31.s }, p0, [x10, x28]\n"
- "mov z31.d, z28.d\n"
- "udot z31.s, z1.b, z19.b\n"
- "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "st1b { z21.s }, p0, [x9, x28]\n"
- "mov z22.d, z28.d\n"
- "udot z28.s, z1.b, z13.b\n"
- "udot z28.s, z15.b, z19.b\n"
- "ext z13.b, z13.b, z13.b, #0x1\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
- "udot z26.s, z1.b, z13.b\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z24.s, #0x0\n"
+ "udot z0.s, z17.b, z12.b\n"
+ "udot z25.s, z17.b, z5.b\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
"mov z18.s, #0x0\n"
- "udot z22.s, z1.b, z19.b\n"
- "udot z18.s, z25.b, z19.b\n"
- "incw x28\n"
- "udot z31.s, z15.b, z12.b\n"
- "udot z28.s, z23.b, z12.b\n"
+ "incw x14, ALL, MUL #4\n"
+ "udot z24.s, z9.b, z5.b\n"
+ "udot z0.s, z20.b, z5.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "udot z25.s, z20.b, z29.b\n"
+ "udot z24.s, z9.b, z29.b\n"
+ "udot z18.s, z9.b, z5.b\n"
+ "udot z0.s, z10.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "udot z28.s, z17.b, z5.b\n"
+ "movprfx z19, z24\n udot z19.s, z9.b, z6.b\n"
+ "udot z24.s, z9.b, z12.b\n"
"ext z12.b, z12.b, z12.b, #0x1\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
- "udot z26.s, z15.b, z19.b\n"
- "udot z22.s, z15.b, z12.b\n"
+ "udot z25.s, z10.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "udot z18.s, z9.b, z29.b\n"
+ "udot z26.s, z17.b, z12.b\n"
+ "udot z28.s, z20.b, z29.b\n"
+ "mls z0.s, p2/M, z24.s, z13.s\n"
+ "mov z22.s, #0x0\n"
+ "mls z25.s, p2/M, z19.s, z13.s\n"
+ "udot z22.s, z9.b, z23.b\n"
+ "udot z26.s, z20.b, z5.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "movprfx z5, z18\n udot z5.s, z9.b, z6.b\n"
+ "udot z18.s, z9.b, z12.b\n"
+ "ld1w { z19.s }, p2/Z, [%x[params]]\n"
+ "udot z28.s, z10.b, z6.b\n"
+ "udot z22.s, z9.b, z11.b\n"
+ "udot z26.s, z10.b, z29.b\n"
+ ".inst 0x04b37400 // sqrdmulh z0.s, z0.s, z19.s\n"
+ ".inst 0x04b37739 // sqrdmulh z25.s, z25.s, z19.s\n"
+ "mls z28.s, p2/M, z5.s, z13.s\n"
+ "and z5.d, z0.d, z20.d\n"
+ "mls z26.s, p2/M, z18.s, z13.s\n"
+ "mov z18.s, #0x0\n"
+ "and z12.d, z25.d, z20.d\n"
+ "movprfx z10, z22\n udot z10.s, z9.b, z1.b\n"
+ "udot z22.s, z9.b, z2.b\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ ".inst 0x04b3775a // sqrdmulh z26.s, z26.s, z19.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "asr z12.s, z12.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z5.s\n"
+ "and z19.d, z26.d, z20.d\n"
+ "and z6.d, z28.d, z20.d\n"
+ ".inst 0x44828a80 // srshl z0.s, p2/M, z0.s, z20.s\n"
+ "sqadd z25.s, z25.s, z12.s\n"
+ "ld1b { z5.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "add z0.s, z0.s, z21.s\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ ".inst 0x44828a99 // srshl z25.s, p2/M, z25.s, z20.s\n"
+ "sqadd z28.s, z28.s, z6.s\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smax z0.s, p2/M, z0.s, z31.s\n"
+ ".inst 0x44828a9a // srshl z26.s, p2/M, z26.s, z20.s\n"
+ ".inst 0x44828a9c // srshl z28.s, p2/M, z28.s, z20.s\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z26.s, z26.s, z21.s\n"
+ "smin z0.s, p2/M, z0.s, z15.s\n"
+ "add z28.s, z28.s, z21.s\n"
+ "smax z26.s, p2/M, z26.s, z31.s\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "smax z28.s, p2/M, z28.s, z31.s\n"
+ "st1b { z0.s }, p0, [x12, x13]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
- "udot z18.s, z25.b, z12.b\n"
- "udot z31.s, z23.b, z24.b\n"
- "ext z24.b, z24.b, z24.b, #0x1\n"
- "mls z28.s, p2/M, z9.s, z8.s\n"
- "udot z26.s, z23.b, z12.b\n"
- ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
- "udot z22.s, z23.b, z24.b\n"
- "movprfx z12, z18\n udot z12.s, z25.b, z24.b\n"
- "and z2.d, z28.d, z21.d\n"
- "udot z18.s, z25.b, z13.b\n"
- "mls z26.s, p2/M, z18.s, z8.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- "mls z31.s, p2/M, z27.s, z8.s\n"
- "mls z22.s, p2/M, z12.s, z8.s\n"
- ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
- ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
- ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
- "sqadd z28.s, z28.s, z2.s\n"
- "and z24.d, z26.d, z21.d\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- "and z23.d, z31.d, z21.d\n"
- "and z18.d, z22.d, z21.d\n"
- "asr z24.s, z24.s, #0x1f\n"
- "asr z23.s, z23.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
- "sqadd z26.s, z26.s, z24.s\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
- "sqadd z31.s, z31.s, z23.s\n"
- "sqadd z22.s, z22.s, z18.s\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
- "add z28.s, z28.s, z16.s\n"
- "smax z28.s, p2/M, z28.s, z7.s\n"
- "add z26.s, z26.s, z16.s\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "add z31.s, z31.s, z16.s\n"
- "add z22.s, z22.s, z16.s\n"
- "smax z26.s, p2/M, z26.s, z7.s\n"
- "smax z31.s, p2/M, z31.s, z7.s\n"
- "mov z24.s, #0x0\n"
- "udot z24.s, z25.b, z11.b\n"
- "smax z22.s, p2/M, z22.s, z7.s\n"
- "st1b { z28.s }, p0, [x12, x28]\n"
- "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
- "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z22.s, p2/M, z22.s, z6.s\n"
- "st1b { z26.s }, p0, [x11, x28]\n"
- "mov z28.d, z23.d\n"
- "udot z24.s, z25.b, z20.b\n"
- "st1b { z31.s }, p0, [x10, x28]\n"
- "mov z27.d, z23.d\n"
- "udot z27.s, z19.b, z11.b\n"
- "movprfx z13, z24\n udot z13.s, z25.b, z0.b\n"
- "st1b { z22.s }, p0, [x9, x28]\n"
- "mov z26.d, z23.d\n"
- "udot z23.s, z19.b, z14.b\n"
- "udot z23.s, z30.b, z11.b\n"
- "udot z24.s, z25.b, z14.b\n"
- "ext z14.b, z14.b, z14.b, #0x1\n"
- "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
- "udot z28.s, z19.b, z14.b\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "st1b { z26.s }, p0, [x11, x13]\n"
+ "mov z6.d, z29.d\n"
+ "st1b { z25.s }, p0, [x10, x13]\n"
+ "mov z25.d, z29.d\n"
+ "st1b { z28.s }, p0, [x9, x13]\n"
+ "mov z0.d, z29.d\n"
+ "udot z29.s, z17.b, z2.b\n"
+ "incw x13\n"
+ "udot z25.s, z17.b, z23.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "udot z29.s, z19.b, z23.b\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "udot z6.s, z17.b, z2.b\n"
+ "udot z0.s, z17.b, z23.b\n"
+ "udot z18.s, z9.b, z23.b\n"
+ "udot z25.s, z19.b, z11.b\n"
+ "udot z29.s, z5.b, z11.b\n"
"ext z11.b, z11.b, z11.b, #0x1\n"
- "mov z12.s, #0x0\n"
- "udot z26.s, z19.b, z11.b\n"
- "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
- "udot z12.s, z25.b, z11.b\n"
- "udot z27.s, z30.b, z20.b\n"
- "incw x28\n"
- "whilelt p0.s, x28, %x[n_channels]\n"
- "udot z23.s, z21.b, z20.b\n"
- "ext z20.b, z20.b, z20.b, #0x1\n"
- "udot z28.s, z30.b, z11.b\n"
- "udot z26.s, z30.b, z20.b\n"
- "udot z12.s, z25.b, z20.b\n"
- "udot z27.s, z21.b, z0.b\n"
- "ext z0.b, z0.b, z0.b, #0x1\n"
- "mls z23.s, p2/M, z24.s, z8.s\n"
- "udot z28.s, z21.b, z20.b\n"
- "udot z26.s, z21.b, z0.b\n"
- ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
- "movprfx z19, z12\n udot z19.s, z25.b, z0.b\n"
- "udot z12.s, z25.b, z14.b\n"
- "and z18.d, z23.d, z22.d\n"
- "mls z28.s, p2/M, z12.s, z8.s\n"
- "mls z27.s, p2/M, z13.s, z8.s\n"
+ "udot z6.s, z19.b, z23.b\n"
+ "udot z0.s, z19.b, z11.b\n"
+ "udot z18.s, z9.b, z11.b\n"
+ "udot z25.s, z5.b, z1.b\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "mls z29.s, p2/M, z22.s, z13.s\n"
+ "mov z28.s, #0x0\n"
+ "udot z6.s, z5.b, z11.b\n"
+ "udot z0.s, z5.b, z1.b\n"
+ "movprfx z11, z18\n udot z11.s, z9.b, z1.b\n"
+ "udot z18.s, z9.b, z2.b\n"
+ "udot z28.s, z9.b, z3.b\n"
+ ".inst 0x04b877bd // sqrdmulh z29.s, z29.s, z24.s\n"
+ "mls z25.s, p2/M, z10.s, z13.s\n"
+ "mls z6.s, p2/M, z18.s, z13.s\n"
+ "mov z1.s, #0x0\n"
+ "mls z0.s, p2/M, z11.s, z13.s\n"
+ "and z11.d, z29.d, z12.d\n"
+ ".inst 0x04b87739 // sqrdmulh z25.s, z25.s, z24.s\n"
+ "udot z28.s, z9.b, z16.b\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ ".inst 0x04b874c6 // sqrdmulh z6.s, z6.s, z24.s\n"
+ ".inst 0x04b87400 // sqrdmulh z0.s, z0.s, z24.s\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z22.d, z25.d, z12.d\n"
+ "sqadd z29.s, z29.s, z11.s\n"
+ "and z18.d, z6.d, z12.d\n"
+ "movprfx z24, z28\n udot z24.s, z9.b, z27.b\n"
+ "udot z28.s, z9.b, z8.b\n"
+ "and z11.d, z0.d, z12.d\n"
+ "asr z22.s, z22.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
- "mls z26.s, p2/M, z19.s, z8.s\n"
- ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
- ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
- ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
- "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
- "sqadd z23.s, z23.s, z18.s\n"
- "and z20.d, z28.d, z22.d\n"
- ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
- "and z19.d, z27.d, z22.d\n"
- "and z18.d, z26.d, z22.d\n"
- "asr z20.s, z20.s, #0x1f\n"
+ ".inst 0x4482899d // srshl z29.s, p2/M, z29.s, z12.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z18.s\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z25.s, z25.s, z22.s\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "add z29.s, z29.s, z21.s\n"
+ "sqadd z0.s, z0.s, z11.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ ".inst 0x44828986 // srshl z6.s, p2/M, z6.s, z12.s\n"
+ ".inst 0x44828999 // srshl z25.s, p2/M, z25.s, z12.s\n"
+ "smax z29.s, p2/M, z29.s, z31.s\n"
+ ".inst 0x44828980 // srshl z0.s, p2/M, z0.s, z12.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "add z6.s, z6.s, z21.s\n"
+ "add z25.s, z25.s, z21.s\n"
+ "add z0.s, z0.s, z21.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "smax z6.s, p2/M, z6.s, z31.s\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "smax z0.s, p2/M, z0.s, z31.s\n"
+ "st1b { z29.s }, p0, [x12, x13]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "smin z6.s, p2/M, z6.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "smin z0.s, p2/M, z0.s, z15.s\n"
+ "st1b { z6.s }, p0, [x11, x13]\n"
+ "mov z11.d, z29.d\n"
+ "st1b { z25.s }, p0, [x10, x13]\n"
+ "mov z26.d, z29.d\n"
+ "st1b { z0.s }, p0, [x9, x13]\n"
+ "mov z25.d, z29.d\n"
+ "udot z29.s, z18.b, z8.b\n"
+ "incw x13\n"
+ "udot z26.s, z18.b, z3.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "whilelt p0.s, x13, %x[n_channels]\n"
+ "udot z29.s, z20.b, z3.b\n"
+ "ext z3.b, z3.b, z3.b, #0x1\n"
+ "udot z11.s, z18.b, z8.b\n"
+ "udot z25.s, z18.b, z3.b\n"
+ "udot z1.s, z9.b, z3.b\n"
+ "udot z26.s, z20.b, z16.b\n"
+ "udot z29.s, z19.b, z16.b\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "udot z11.s, z20.b, z3.b\n"
+ "udot z25.s, z20.b, z16.b\n"
+ "udot z1.s, z9.b, z16.b\n"
+ "udot z26.s, z19.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "mls z29.s, p2/M, z28.s, z13.s\n"
+ "mov z22.s, #0x0\n"
+ "udot z11.s, z19.b, z16.b\n"
+ "udot z25.s, z19.b, z27.b\n"
+ "movprfx z18, z1\n udot z18.s, z9.b, z27.b\n"
+ "udot z1.s, z9.b, z8.b\n"
+ "udot z22.s, z9.b, z30.b\n"
+ ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n"
+ "mls z26.s, p2/M, z24.s, z13.s\n"
+ "mls z11.s, p2/M, z1.s, z13.s\n"
+ "mov z10.s, #0x0\n"
+ "mls z25.s, p2/M, z18.s, z13.s\n"
+ "and z18.d, z29.d, z23.d\n"
+ ".inst 0x04a5775a // sqrdmulh z26.s, z26.s, z5.s\n"
+ "udot z22.s, z9.b, z7.b\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04a5756b // sqrdmulh z11.s, z11.s, z5.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ "ld1w { z8.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z19.d, z26.d, z23.d\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ "and z18.d, z11.d, z23.d\n"
+ "movprfx z6, z22\n udot z6.s, z9.b, z4.b\n"
+ "udot z22.s, z9.b, z14.b\n"
+ "and z20.d, z25.d, z23.d\n"
"asr z19.s, z19.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
- "sqadd z28.s, z28.s, z20.s\n"
- ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
- "ld1b { z13.b }, p2/Z, [%x[params]]\n"
- "sqadd z27.s, z27.s, z19.s\n"
- "sqadd z26.s, z26.s, z18.s\n"
- ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
- ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
- "add z23.s, z23.s, z16.s\n"
- "smax z23.s, p2/M, z23.s, z7.s\n"
- "add z28.s, z28.s, z16.s\n"
- "smin z23.s, p2/M, z23.s, z6.s\n"
- "add z27.s, z27.s, z16.s\n"
- "add z26.s, z26.s, z16.s\n"
- "smax z28.s, p2/M, z28.s, z7.s\n"
- "smax z27.s, p2/M, z27.s, z7.s\n"
- "mov z24.s, #0x0\n"
- "udot z24.s, z25.b, z17.b\n"
- "smax z26.s, p2/M, z26.s, z7.s\n"
- "st1b { z23.s }, p0, [x12, x28]\n"
- "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
- "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
- "smin z28.s, p2/M, z28.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
- "smin z26.s, p2/M, z26.s, z6.s\n"
- "st1b { z28.s }, p0, [x11, x28]\n"
- "mov z0.d, z1.d\n"
- "udot z24.s, z25.b, z4.b\n"
- "st1b { z27.s }, p0, [x10, x28]\n"
- "mov z31.d, z1.d\n"
- "udot z31.s, z21.b, z17.b\n"
- "movprfx z23, z24\n udot z23.s, z25.b, z5.b\n"
- "st1b { z26.s }, p0, [x9, x28]\n"
- "mov z30.d, z1.d\n"
- "udot z1.s, z21.b, z29.b\n"
- "udot z1.s, z13.b, z17.b\n"
- "udot z24.s, z25.b, z29.b\n"
- "ext z29.b, z29.b, z29.b, #0x1\n"
- "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "udot z0.s, z21.b, z29.b\n"
- "ext z17.b, z17.b, z17.b, #0x1\n"
- "mov z19.s, #0x0\n"
- "udot z30.s, z21.b, z17.b\n"
- "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
- "udot z19.s, z25.b, z17.b\n"
- "udot z31.s, z13.b, z4.b\n"
- "incw x28\n"
- "whilelt p1.s, x28, %x[n_channels]\n"
- "udot z1.s, z20.b, z4.b\n"
- "ext z4.b, z4.b, z4.b, #0x1\n"
- "udot z0.s, z13.b, z17.b\n"
+ ".inst 0x44828afd // srshl z29.s, p2/M, z29.s, z23.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z18.s\n"
+ "ld1b { z24.b }, p2/Z, [%x[params]]\n"
+ "sqadd z26.s, z26.s, z19.s\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "add z29.s, z29.s, z21.s\n"
+ "sqadd z25.s, z25.s, z20.s\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ ".inst 0x44828aeb // srshl z11.s, p2/M, z11.s, z23.s\n"
+ ".inst 0x44828afa // srshl z26.s, p2/M, z26.s, z23.s\n"
+ "smax z29.s, p2/M, z29.s, z31.s\n"
+ ".inst 0x44828af9 // srshl z25.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "add z11.s, z11.s, z21.s\n"
+ "add z26.s, z26.s, z21.s\n"
+ "add z25.s, z25.s, z21.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "smax z11.s, p2/M, z11.s, z31.s\n"
+ "smax z26.s, p2/M, z26.s, z31.s\n"
+ "smax z25.s, p2/M, z25.s, z31.s\n"
+ "st1b { z29.s }, p0, [x12, x13]\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "smin z11.s, p2/M, z11.s, z15.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z11.s }, p0, [x11, x13]\n"
+ "mov z28.d, z2.d\n"
+ "st1b { z26.s }, p0, [x10, x13]\n"
+ "mov z1.d, z2.d\n"
+ "st1b { z25.s }, p0, [x9, x13]\n"
+ "mov z3.d, z2.d\n"
+ "udot z2.s, z18.b, z14.b\n"
+ "incw x13\n"
+ "udot z1.s, z18.b, z30.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
"whilelt p0.b, x14, %x[n_channels]\n"
- "udot z30.s, z13.b, z4.b\n"
- "udot z19.s, z25.b, z4.b\n"
- "ld1b { z13.b }, p0/Z, [x26, x14]\n"
- "ld1b { z28.b }, p0/Z, [x25, x14]\n"
- "udot z31.s, z20.b, z5.b\n"
- "ext z5.b, z5.b, z5.b, #0x1\n"
- "mls z1.s, p2/M, z24.s, z8.s\n"
- "ld1b { z27.b }, p0/Z, [x22, x14]\n"
- "udot z0.s, z20.b, z4.b\n"
- "udot z30.s, z20.b, z5.b\n"
- ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
- "ld1b { z26.b }, p0/Z, [x21, x14]\n"
- "movprfx z18, z19\n udot z18.s, z25.b, z5.b\n"
- "udot z19.s, z25.b, z29.b\n"
- "and z11.d, z1.d, z22.d\n"
- "ld1b { z29.b }, p0/Z, [x23, x14]\n"
- "mls z0.s, p2/M, z19.s, z8.s\n"
- "mls z31.s, p2/M, z23.s, z8.s\n"
- "asr z11.s, z11.s, #0x1f\n"
- "ld1b { z17.b }, p0/Z, [x20, x14]\n"
- "mls z30.s, p2/M, z18.s, z8.s\n"
- ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
- ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
- ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
- "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "udot z2.s, z24.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "udot z28.s, z18.b, z14.b\n"
+ "ld1b { z0.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x21, x14]\n"
+ "udot z3.s, z18.b, z30.b\n"
+ "udot z10.s, z9.b, z30.b\n"
+ "udot z1.s, z24.b, z7.b\n"
+ "udot z2.s, z19.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ "udot z28.s, z24.b, z30.b\n"
+ "ld1b { z30.b }, p0/Z, [x20, x14]\n"
+ "udot z3.s, z24.b, z7.b\n"
+ "udot z10.s, z9.b, z7.b\n"
+ "udot z1.s, z19.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "mls z2.s, p2/M, z22.s, z13.s\n"
+ "udot z28.s, z19.b, z7.b\n"
+ "udot z3.s, z19.b, z4.b\n"
+ "movprfx z18, z10\n udot z18.s, z9.b, z4.b\n"
+ "udot z10.s, z9.b, z14.b\n"
+ "ld1b { z14.b }, p0/Z, [x25, x14]\n"
+ "mls z1.s, p2/M, z6.s, z13.s\n"
+ ".inst 0x04a87442 // sqrdmulh z2.s, z2.s, z8.s\n"
+ "mls z3.s, p2/M, z18.s, z13.s\n"
+ "and z18.d, z2.d, z23.d\n"
+ "mls z28.s, p2/M, z10.s, z13.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04a87421 // sqrdmulh z1.s, z1.s, z8.s\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a87463 // sqrdmulh z3.s, z3.s, z8.s\n"
+ "ld1b { z12.b }, p0/Z, [x28, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x40]\n"
- "sqadd z1.s, z1.s, z11.s\n"
- "and z21.d, z0.d, z22.d\n"
- ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
"ldp x21, x20, [%x[inptrs], #0x50]\n"
- "and z20.d, z31.d, z22.d\n"
- "and z19.d, z30.d, z22.d\n"
- "ld1b { z18.b }, p0/Z, [x23, x14]\n"
- "ld1b { z11.b }, p0/Z, [x22, x14]\n"
- "asr z21.s, z21.s, #0x1f\n"
- "asr z20.s, z20.s, #0x1f\n"
- "ld1b { z24.b }, p0/Z, [x21, x14]\n"
- "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "sqadd z2.s, z2.s, z18.s\n"
+ "and z22.d, z1.d, z23.d\n"
+ "and z18.d, z28.d, z23.d\n"
+ "and z19.d, z3.d, z23.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "asr z22.s, z22.s, #0x1f\n"
+ ".inst 0x44828ae2 // srshl z2.s, p2/M, z2.s, z23.s\n"
+ "ld1b { z11.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z7.b }, p0/Z, [x20, x14]\n"
+ "asr z18.s, z18.s, #0x1f\n"
"asr z19.s, z19.s, #0x1f\n"
- "sqadd z0.s, z0.s, z21.s\n"
- ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
- "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
- "sqadd z31.s, z31.s, z20.s\n"
- "sqadd z30.s, z30.s, z19.s\n"
- ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
- ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
- "add z1.s, z1.s, z16.s\n"
- "smax z1.s, p2/M, z1.s, z7.s\n"
- "add z0.s, z0.s, z16.s\n"
- "ld1b { z9.b }, p0/Z, [x24, x14]\n"
- "add z31.s, z31.s, z16.s\n"
- "add z30.s, z30.s, z16.s\n"
+ "sqadd z1.s, z1.s, z22.s\n"
+ "ld1b { z10.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "add z2.s, z2.s, z21.s\n"
+ "sqadd z28.s, z28.s, z18.s\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z3.s, z3.s, z19.s\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n"
+ "smax z2.s, p2/M, z2.s, z31.s\n"
+ ".inst 0x44828afc // srshl z28.s, p2/M, z28.s, z23.s\n"
+ ".inst 0x44828ae3 // srshl z3.s, p2/M, z3.s, z23.s\n"
+ "ld1b { z5.b }, p0/Z, [x24, x14]\n"
"ldp x23, x22, [%x[inptrs], #0x60]\n"
"ldp x21, x20, [%x[inptrs], #0x70]\n"
- "smin z1.s, p2/M, z1.s, z6.s\n"
- "smax z0.s, p2/M, z0.s, z7.s\n"
- "st1b { z1.s }, p1, [x12, x28]\n"
- "ld1b { z2.b }, p0/Z, [x23, x14]\n"
- "smax z31.s, p2/M, z31.s, z7.s\n"
- "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ldp x28, x27, [%x[inptrs], #0x0]\n"
+ "add z1.s, z1.s, z21.s\n"
+ "smin z2.s, p2/M, z2.s, z15.s\n"
+ "ldp x26, x25, [%x[inptrs], #0x10]\n"
+ "add z28.s, z28.s, z21.s\n"
+ "add z3.s, z3.s, z21.s\n"
+ "ld1b { z6.b }, p0/Z, [x23, x14]\n"
"ld1b { z23.b }, p0/Z, [x22, x14]\n"
- "ld1b { z22.b }, p0/Z, [x21, x14]\n"
- "ld1b { z5.b }, p0/Z, [x20, x14]\n"
- "zip2 z20.b, z15.b, z28.b\n"
- "zip1 z15.b, z15.b, z28.b\n"
- "smin z0.s, p2/M, z0.s, z6.s\n"
- "zip1 z19.b, z13.b, z29.b\n"
- "zip2 z29.b, z13.b, z29.b\n"
- "smin z31.s, p2/M, z31.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
- "st1b { z0.s }, p1, [x11, x28]\n"
- "zip2 z13.b, z15.b, z19.b\n"
- "zip1 z15.b, z15.b, z19.b\n"
- "ldp x27, x26, [%x[inptrs], #0x0]\n"
- "st1b { z31.s }, p1, [x10, x28]\n"
- "zip1 z14.b, z20.b, z29.b\n"
- "zip2 z29.b, z20.b, z29.b\n"
- "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
- "st1b { z30.s }, p1, [x9, x28]\n"
- "zip2 z21.b, z9.b, z26.b\n"
- "zip1 z9.b, z9.b, z26.b\n"
- "incw x28\n"
- "zip1 z20.b, z27.b, z17.b\n"
- "zip2 z17.b, z27.b, z17.b\n"
- "ldp x25, x23, [%x[inptrs], #0x10]\n"
"ldp x24, x22, [%x[inptrs], #0x20]\n"
- "zip2 z31.b, z18.b, z24.b\n"
- "zip1 z18.b, z18.b, z24.b\n"
+ "smax z1.s, p2/M, z1.s, z31.s\n"
+ "st1b { z2.s }, p1, [x12, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "zip2 z19.b, z12.b, z27.b\n"
+ "zip1 z12.b, z12.b, z27.b\n"
+ "smax z28.s, p2/M, z28.s, z31.s\n"
+ "smax z3.s, p2/M, z3.s, z31.s\n"
+ "zip1 z18.b, z0.b, z14.b\n"
+ "zip2 z14.b, z0.b, z14.b\n"
+ "smin z1.s, p2/M, z1.s, z15.s\n"
"ldp x21, x20, [%x[inptrs], #0x30]\n"
- "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
- "zip1 z27.b, z11.b, z4.b\n"
- "zip2 z4.b, z11.b, z4.b\n"
- "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z3.s, p2/M, z3.s, z15.s\n"
+ "zip2 z2.b, z12.b, z18.b\n"
+ "zip1 z12.b, z12.b, z18.b\n"
+ "zip1 z8.b, z19.b, z14.b\n"
+ "zip2 z14.b, z19.b, z14.b\n"
+ "ld1w { z0.s }, p2/Z, [%x[params], #4, MUL VL]\n"
"addvl %x[params], %x[params], #8\n"
- "zip2 z30.b, z2.b, z22.b\n"
- "zip1 z2.b, z2.b, z22.b\n"
- "zip1 z28.b, z23.b, z5.b\n"
- "zip2 z5.b, z23.b, z5.b\n"
- "zip2 z19.b, z9.b, z20.b\n"
- "zip1 z9.b, z9.b, z20.b\n"
- "zip1 z11.b, z21.b, z17.b\n"
- "zip2 z17.b, z21.b, z17.b\n"
- "zip2 z12.b, z18.b, z27.b\n"
- "zip1 z18.b, z18.b, z27.b\n"
- "zip1 z20.b, z31.b, z4.b\n"
- "zip2 z4.b, z31.b, z4.b\n"
- "zip2 z24.b, z2.b, z28.b\n"
- "zip1 z2.b, z2.b, z28.b\n"
- "zip1 z0.b, z30.b, z5.b\n"
- "zip2 z5.b, z30.b, z5.b\n"
- "mov z22.d, z10.d\n"
- "mov z31.d, z10.d\n"
- "mov z21.d, z10.d\n"
+ "st1b { z28.s }, p1, [x11, x13]\n"
+ "zip2 z27.b, z5.b, z25.b\n"
+ "zip1 z5.b, z5.b, z25.b\n"
+ "st1b { z1.s }, p1, [x10, x13]\n"
+ "zip1 z18.b, z26.b, z30.b\n"
+ "zip2 z30.b, z26.b, z30.b\n"
+ "st1b { z3.s }, p1, [x9, x13]\n"
+ "zip2 z19.b, z29.b, z11.b\n"
+ "zip1 z29.b, z29.b, z11.b\n"
+ "incw x13\n"
+ "zip1 z28.b, z24.b, z7.b\n"
+ "zip2 z7.b, z24.b, z7.b\n"
+ "zip2 z25.b, z6.b, z22.b\n"
+ "zip1 z6.b, z6.b, z22.b\n"
+ "zip1 z22.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "zip2 z23.b, z5.b, z18.b\n"
+ "zip1 z5.b, z5.b, z18.b\n"
+ "zip1 z3.b, z27.b, z30.b\n"
+ "zip2 z30.b, z27.b, z30.b\n"
+ "zip2 z11.b, z29.b, z28.b\n"
+ "zip1 z29.b, z29.b, z28.b\n"
+ "zip1 z16.b, z19.b, z7.b\n"
+ "zip2 z7.b, z19.b, z7.b\n"
+ "zip2 z1.b, z6.b, z22.b\n"
+ "zip1 z6.b, z6.b, z22.b\n"
+ "zip1 z27.b, z25.b, z4.b\n"
+ "zip2 z4.b, z25.b, z4.b\n"
+ "mov z26.d, z0.d\n"
+ "mov z25.d, z0.d\n"
+ "mov z28.d, z0.d\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 5c26010c0d..5e32044434 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,316 +91,316 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x16, #0x0\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x17, #0x0\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x16\n"
- "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_outptrs]]\n"
"ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z12.b }, p4/Z, [x21]\n"
- "ld1rb { z30.b }, p4/Z, [x20]\n"
- "add x21, x25, %[offsetof_Requantize32_minval]\n"
- "add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z24.h }, p4/Z, [x22]\n"
- "ld1rh { z11.h }, p4/Z, [x21]\n"
- "ld1rh { z26.h }, p4/Z, [x20]\n"
- "ldp x13, x12, [x24, #0x0]\n"
- "incw x23\n"
- "whilelt p3.h, x16, x15\n"
- "ldp x11, x10, [x24, #0x10]\n"
- "whilelt p2.s, x16, x15\n"
- "whilelt p1.s, x23, x15\n"
- "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z14.h }, p4/Z, [x14]\n"
- "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
- "add x28, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x27, #0x0\n"
- "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x12, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x17\n"
+ "add x20, x26, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x26, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x26, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x26, %[offsetof_Requantize32_minval]\n"
+ "add x20, x26, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z15.b }, p4/Z, [x23]\n"
+ "ld1rh { z26.h }, p4/Z, [x22]\n"
+ "ld1rh { z2.h }, p4/Z, [x21]\n"
+ "ld1rh { z14.h }, p4/Z, [x20]\n"
+ "incw x24\n"
+ "whilelt p3.h, x17, x15\n"
+ "ldp x9, x28, [x16, #0x0]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "whilelt p2.s, x17, x15\n"
+ "whilelt p1.s, x24, x15\n"
+ "ld1b { z13.h }, p4/Z, [x14]\n"
+ "ld1b { z11.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #2, MUL VL]\n"
"ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
- ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
- ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
- "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
- "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
- ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
- ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
- "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
- "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "ld1b { z20.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z30.h }, p4/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z28.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z17.h }, p4/Z, [x14, #7, MUL VL]\n"
"inch x14, ALL, MUL #8\n"
- ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
- "ld1w { z17.s }, p2/Z, [x9]\n"
- "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z5.s, z17.s, z16.s\n"
- "uzp2 z9.s, z17.s, z16.s\n"
- "ld1b { z8.h }, p4/Z, [x14]\n"
- "ldp x24, x23, [x28, #0x0]\n"
- "addvl x9, x9, #2\n"
- "mov z17.d, z5.d\n"
- "ldp x22, x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x20]\n"
- "mov z25.d, z9.d\n"
- "mov z16.d, z5.d\n"
- "ld1b { z0.h }, p3/Z, [x24, x16]\n"
- "ld1b { z29.h }, p3/Z, [x23, x16]\n"
- "mov z23.d, z9.d\n"
- "mov z22.d, z5.d\n"
- "ld1b { z4.h }, p3/Z, [x22, x16]\n"
- "ld1b { z13.h }, p3/Z, [x21, x16]\n"
- "mov z27.d, z9.d\n"
- ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
- "ld1b { z20.h }, p3/Z, [x20, x16]\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
- ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x9, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
- ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454f19ad // usublb z13.h, z13.b, z15.b\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "ld1w { z24.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x454f196b // usublb z11.h, z11.b, z15.b\n"
+ ".inst 0x454f1a52 // usublb z18.h, z18.b, z15.b\n"
+ ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
+ "ld1b { z5.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ ".inst 0x454f1a94 // usublb z20.h, z20.b, z15.b\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ "uzp1 z3.s, z19.s, z24.s\n"
+ "uzp2 z16.s, z19.s, z24.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1a31 // usublb z17.h, z17.b, z15.b\n"
+ ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x17]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1b { z4.h }, p3/Z, [x22, x17]\n"
+ "mov z8.d, z3.d\n"
+ "mov z21.d, z16.d\n"
+ "ld1b { z1.h }, p3/Z, [x21, x17]\n"
+ "mov z0.d, z3.d\n"
+ "mov z29.d, z16.d\n"
+ "ld1b { z27.h }, p3/Z, [x20, x17]\n"
+ "mov z19.d, z3.d\n"
+ "mov z9.d, z16.d\n"
+ ".inst 0x454a18e7 // usublb z7.h, z7.b, z10.b\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x454a1884 // usublb z4.h, z4.b, z10.b\n"
+ ".inst 0x454a1821 // usublb z1.h, z1.b, z10.b\n"
+ ".inst 0x454a1b7b // usublb z27.h, z27.b, z10.b\n"
"1:" // Loop
- ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
- ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
- "ldr x20, [x28, #0x28]\n"
- "ldr x21, [x28, #0x38]\n"
- ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
- ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
- "ld1b { z3.h }, p3/Z, [x20, x16]\n"
- "ldr x20, [x28, #0x30]\n"
- ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
- ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
- "ld1b { z31.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
- ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
- "ldr x21, [x28, #0x40]\n"
- "ld1b { z15.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
- ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- "ldr x20, [x28, #0x48]\n"
- ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
- ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
- "ld1b { z19.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
- ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
- ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x16]\n"
- ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
- ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
- ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
- "ldr x21, [x28, #0x50]\n"
- "ldr x20, [x28, #0x58]\n"
- ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
- ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- "ld1b { z4.h }, p3/Z, [x21, x16]\n"
- ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
- ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x16]\n"
- "ldr x21, [x28, #0x60]\n"
- ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
- ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
- "ldr x20, [x28, #0x68]\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
- ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- "ld1b { z0.h }, p3/Z, [x21, x16]\n"
- ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
- ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
- "ld1b { z3.h }, p3/Z, [x20, x16]\n"
- "ldr x20, [x28, #0x70]\n"
- ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
- ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- "ld1b { z13.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
- ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ldr x20, [x28, #0x78]\n"
- ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
- ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
- ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
- "whilelt p0.h, x27, x15\n"
- ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
- ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
- "ld1w { z20.s }, p2/Z, [x26]\n"
+ ".inst 0x449440e3 // smlalb z3.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x449444f0 // smlalt z16.s, p4/M, z7.h, z20.h\n"
+ "ldr x25, [x13, #0x28]\n"
+ "ldr x24, [x13, #0x38]\n"
+ ".inst 0x448640e8 // smlalb z8.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x448b40e0 // smlalb z0.s, p4/M, z7.h, z11.h\n"
+ "ldr x23, [x13, #0x30]\n"
+ "ldr x22, [x13, #0x40]\n"
+ ".inst 0x448d40f3 // smlalb z19.s, p4/M, z7.h, z13.h\n"
+ ".inst 0x448644f5 // smlalt z21.s, p4/M, z7.h, z6.h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "ld1b { z22.h }, p3/Z, [x25, x17]\n"
+ ".inst 0x448b44fd // smlalt z29.s, p4/M, z7.h, z11.h\n"
+ ".inst 0x448d44e9 // smlalt z9.s, p4/M, z7.h, z13.h\n"
+ "ld1b { z31.h }, p3/Z, [x24, x17]\n"
+ ".inst 0x448d4303 // smlalb z3.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x448d4710 // smlalt z16.s, p4/M, z24.h, z13.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1b { z25.h }, p3/Z, [x22, x17]\n"
+ ".inst 0x44924088 // smlalb z8.s, p4/M, z4.h, z18.h\n"
+ ".inst 0x44924020 // smlalb z0.s, p4/M, z1.h, z18.h\n"
+ "ld1b { z23.h }, p3/Z, [x20, x17]\n"
+ "ldr x20, [x13, #0x58]\n"
+ ".inst 0x448b4033 // smlalb z19.s, p4/M, z1.h, z11.h\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44924495 // smlalt z21.s, p4/M, z4.h, z18.h\n"
+ "ld1b { z12.h }, p3/Z, [x21, x17]\n"
+ ".inst 0x4492443d // smlalt z29.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448b4429 // smlalt z9.s, p4/M, z1.h, z11.h\n"
+ ".inst 0x454a1bff // usublb z31.h, z31.b, z10.b\n"
+ "ldr x21, [x13, #0x60]\n"
+ ".inst 0x449e4023 // smlalb z3.s, p4/M, z1.h, z30.h\n"
+ ".inst 0x449e4430 // smlalt z16.s, p4/M, z1.h, z30.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ld1b { z4.h }, p3/Z, [x20, x17]\n"
+ ".inst 0x44944028 // smlalb z8.s, p4/M, z1.h, z20.h\n"
+ ".inst 0x449c42c0 // smlalb z0.s, p4/M, z22.h, z28.h\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ "ldr x20, [x13, #0x68]\n"
+ ".inst 0x44864373 // smlalb z19.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44944435 // smlalt z21.s, p4/M, z1.h, z20.h\n"
+ ".inst 0x454a1af7 // usublb z23.h, z23.b, z10.b\n"
+ "ld1b { z7.h }, p3/Z, [x21, x17]\n"
+ ".inst 0x449c46dd // smlalt z29.s, p4/M, z22.h, z28.h\n"
+ ".inst 0x44864769 // smlalt z9.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x454a198c // usublb z12.h, z12.b, z10.b\n"
+ "ldr x21, [x13, #0x70]\n"
+ ".inst 0x44914363 // smlalb z3.s, p4/M, z27.h, z17.h\n"
+ ".inst 0x44914770 // smlalt z16.s, p4/M, z27.h, z17.h\n"
+ ".inst 0x454a1884 // usublb z4.h, z4.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x20, x17]\n"
+ ".inst 0x449c4368 // smlalb z8.s, p4/M, z27.h, z28.h\n"
+ ".inst 0x44944360 // smlalb z0.s, p4/M, z27.h, z20.h\n"
+ ".inst 0x454a18e7 // usublb z7.h, z7.b, z10.b\n"
+ "ldr x20, [x13, #0x78]\n"
+ ".inst 0x44854313 // smlalb z19.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x449c4775 // smlalt z21.s, p4/M, z27.h, z28.h\n"
+ "ld1b { z1.h }, p3/Z, [x21, x17]\n"
+ "whilelt p0.h, x12, x15\n"
+ ".inst 0x4494477d // smlalt z29.s, p4/M, z27.h, z20.h\n"
+ ".inst 0x44854709 // smlalt z9.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
+ ".inst 0x448b43e3 // smlalb z3.s, p4/M, z31.h, z11.h\n"
+ ".inst 0x448b47f0 // smlalt z16.s, p4/M, z31.h, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x11, #1, MUL VL]\n"
"inch x14\n"
- ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
- ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
- "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x448d43e8 // smlalb z8.s, p4/M, z31.h, z13.h\n"
+ ".inst 0x449e42e0 // smlalb z0.s, p4/M, z23.h, z30.h\n"
+ ".inst 0x454a1821 // usublb z1.h, z1.b, z10.b\n"
"ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
- ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
- "addvl x26, x26, #2\n"
- ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
- ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
- ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
- ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x16]\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
- "uzp1 z2.s, z20.s, z15.s\n"
- "inch x16\n"
- ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
- ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
- "uzp2 z15.s, z20.s, z15.s\n"
- "ld1w { z20.s }, p2/Z, [x25]\n"
- ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
- "mov x20, x16\n"
+ ".inst 0x449442f3 // smlalb z19.s, p4/M, z23.h, z20.h\n"
+ ".inst 0x448d47f5 // smlalt z21.s, p4/M, z31.h, z13.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x17]\n"
+ "inch x17\n"
+ ".inst 0x449e46fd // smlalt z29.s, p4/M, z23.h, z30.h\n"
+ ".inst 0x449446e9 // smlalt z9.s, p4/M, z23.h, z20.h\n"
+ "uzp1 z20.s, z24.s, z27.s\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x44924323 // smlalb z3.s, p4/M, z25.h, z18.h\n"
+ ".inst 0x44924730 // smlalt z16.s, p4/M, z25.h, z18.h\n"
+ "uzp2 z24.s, z24.s, z27.s\n"
+ "ld1w { z27.s }, p2/Z, [x10]\n"
+ ".inst 0x448b4328 // smlalb z8.s, p4/M, z25.h, z11.h\n"
+ ".inst 0x448d4180 // smlalb z0.s, p4/M, z12.h, z13.h\n"
+ ".inst 0x454a1bff // usublb z31.h, z31.b, z10.b\n"
+ "mov x20, x17\n"
+ ".inst 0x44924093 // smlalb z19.s, p4/M, z4.h, z18.h\n"
+ ".inst 0x448b4735 // smlalt z21.s, p4/M, z25.h, z11.h\n"
+ "ld1w { z25.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "whilelt p2.s, x17, x15\n"
+ ".inst 0x448d459d // smlalt z29.s, p4/M, z12.h, z13.h\n"
+ ".inst 0x44924489 // smlalt z9.s, p4/M, z4.h, z18.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x448542e3 // smlalb z3.s, p4/M, z23.h, z5.h\n"
+ ".inst 0x448546f0 // smlalt z16.s, p4/M, z23.h, z5.h\n"
"incw x20\n"
- ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
- ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
- "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
- "uzp1 z21.s, z20.s, z19.s\n"
- ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
- "uzp2 z1.s, z20.s, z19.s\n"
- "whilelt p2.s, x16, x15\n"
- ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
- ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449142e8 // smlalb z8.s, p4/M, z23.h, z17.h\n"
+ ".inst 0x448640e0 // smlalb z0.s, p4/M, z7.h, z6.h\n"
+ "uzp1 z11.s, z27.s, z25.s\n"
+ ".inst 0x449e42d3 // smlalb z19.s, p4/M, z22.h, z30.h\n"
+ ".inst 0x449146f5 // smlalt z21.s, p4/M, z23.h, z17.h\n"
+ "uzp2 z27.s, z27.s, z25.s\n"
+ ".inst 0x448644fd // smlalt z29.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x449e46c9 // smlalt z9.s, p4/M, z22.h, z30.h\n"
"whilelt p1.s, x20, x15\n"
- "whilelt p3.h, x16, x15\n"
- ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
- ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
- ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
- "addvl x25, x25, #2\n"
- ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
- ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
- "and z19.d, z5.d, z21.d\n"
- ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
- ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
- ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
- "sqadd z5.s, z5.s, z19.s\n"
- ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
- ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
- ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
- ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
- ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
- ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
- "and z29.d, z9.d, z1.d\n"
- ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
- ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
- ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
- ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
- ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
- ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
- ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
- ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
- "asr z29.s, z29.s, #0x1f\n"
- "and z18.d, z17.d, z21.d\n"
- ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
- "and z20.d, z16.d, z21.d\n"
- ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
- "and z19.d, z22.d, z21.d\n"
- ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
- "sqadd z9.s, z9.s, z29.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z7.d, z25.d, z1.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z6.d, z23.d, z1.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z27.d, z1.d\n"
- "sqadd z17.s, z17.s, z18.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
- "sqadd z16.s, z16.s, z20.s\n"
+ "whilelt p3.h, x17, x15\n"
+ ".inst 0x44864183 // smlalb z3.s, p4/M, z12.h, z6.h\n"
+ ".inst 0x44864590 // smlalt z16.s, p4/M, z12.h, z6.h\n"
+ ".inst 0x449e4088 // smlalb z8.s, p4/M, z4.h, z30.h\n"
+ ".inst 0x44914020 // smlalb z0.s, p4/M, z1.h, z17.h\n"
+ ".inst 0x449c4033 // smlalb z19.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449e4495 // smlalt z21.s, p4/M, z4.h, z30.h\n"
+ ".inst 0x4491443d // smlalt z29.s, p4/M, z1.h, z17.h\n"
+ ".inst 0x449c4429 // smlalt z9.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c40e3 // smlalb z3.s, p4/M, z7.h, z28.h\n"
+ ".inst 0x449c44f0 // smlalt z16.s, p4/M, z7.h, z28.h\n"
+ ".inst 0x448542c8 // smlalb z8.s, p4/M, z22.h, z5.h\n"
+ ".inst 0x448543e0 // smlalb z0.s, p4/M, z31.h, z5.h\n"
+ ".inst 0x449143f3 // smlalb z19.s, p4/M, z31.h, z17.h\n"
+ ".inst 0x448546d5 // smlalt z21.s, p4/M, z22.h, z5.h\n"
+ ".inst 0x448547fd // smlalt z29.s, p4/M, z31.h, z5.h\n"
+ ".inst 0x449147e9 // smlalt z9.s, p4/M, z31.h, z17.h\n"
+ ".inst 0x04b47463 // sqrdmulh z3.s, z3.s, z20.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ ".inst 0x04b47400 // sqrdmulh z0.s, z0.s, z20.s\n"
+ "and z4.d, z3.d, z11.d\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ "and z13.d, z16.d, z27.d\n"
+ "and z6.d, z8.d, z11.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z7.d, z0.d, z11.d\n"
+ ".inst 0x04b877bd // sqrdmulh z29.s, z29.s, z24.s\n"
+ ".inst 0x04b87529 // sqrdmulh z9.s, z9.s, z24.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
- ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
- "sqadd z22.s, z22.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
- "sqadd z25.s, z25.s, z7.s\n"
- "sqadd z23.s, z23.s, z6.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- "sqadd z27.s, z27.s, z2.s\n"
- ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
- ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
- ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
- ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
- ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
- ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
- ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
- "sqadd z5.h, z5.h, z24.h\n"
- "smax z5.h, p4/M, z5.h, z11.h\n"
- "smin z5.h, p4/M, z5.h, z26.h\n"
- "sqadd z17.h, z17.h, z24.h\n"
- "sqadd z16.h, z16.h, z24.h\n"
- "smax z17.h, p4/M, z17.h, z11.h\n"
- "smax z16.h, p4/M, z16.h, z11.h\n"
- "sqadd z22.h, z22.h, z24.h\n"
- "smax z22.h, p4/M, z22.h, z11.h\n"
- "smin z17.h, p4/M, z17.h, z26.h\n"
- "st1b { z5.h }, p0, [x13, x27]\n"
- "smin z16.h, p4/M, z16.h, z26.h\n"
- "smin z22.h, p4/M, z22.h, z26.h\n"
- "st1b { z17.h }, p0, [x12, x27]\n"
- "st1b { z16.h }, p0, [x11, x27]\n"
- "st1b { z22.h }, p0, [x10, x27]\n"
- "ld1b { z14.h }, p4/Z, [x14]\n"
- "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
- "inch x27\n"
- "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "sqadd z3.s, z3.s, z4.s\n"
+ "and z20.d, z19.d, z11.d\n"
+ "and z18.d, z21.d, z27.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z13.s\n"
+ "and z13.d, z29.d, z27.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z23.d, z9.d, z27.d\n"
+ ".inst 0x44829163 // srshl z3.s, p4/M, z3.s, z11.s\n"
+ "sqadd z8.s, z8.s, z6.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z7.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
+ ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ ".inst 0x44829168 // srshl z8.s, p4/M, z8.s, z11.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ ".inst 0x45304063 // sqxtnb z3.h, z3.s\n"
+ ".inst 0x44829160 // srshl z0.s, p4/M, z0.s, z11.s\n"
+ "sqadd z29.s, z29.s, z13.s\n"
+ ".inst 0x44829173 // srshl z19.s, p4/M, z19.s, z11.s\n"
+ "sqadd z9.s, z9.s, z23.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x44829375 // srshl z21.s, p4/M, z21.s, z27.s\n"
+ ".inst 0x45304000 // sqxtnb z0.h, z0.s\n"
+ ".inst 0x45304603 // sqxtnt z3.h, z16.s\n"
+ ".inst 0x4482937d // srshl z29.s, p4/M, z29.s, z27.s\n"
+ ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
+ ".inst 0x45304273 // sqxtnb z19.h, z19.s\n"
+ ".inst 0x453046a8 // sqxtnt z8.h, z21.s\n"
+ ".inst 0x453047a0 // sqxtnt z0.h, z29.s\n"
+ ".inst 0x45304533 // sqxtnt z19.h, z9.s\n"
+ "sqadd z3.h, z3.h, z26.h\n"
+ "sqadd z8.h, z8.h, z26.h\n"
+ "sqadd z0.h, z0.h, z26.h\n"
+ "sqadd z19.h, z19.h, z26.h\n"
+ "smax z3.h, p4/M, z3.h, z2.h\n"
+ "smax z8.h, p4/M, z8.h, z2.h\n"
+ "smax z0.h, p4/M, z0.h, z2.h\n"
+ "smax z19.h, p4/M, z19.h, z2.h\n"
+ "smin z3.h, p4/M, z3.h, z14.h\n"
+ "smin z8.h, p4/M, z8.h, z14.h\n"
+ "smin z0.h, p4/M, z0.h, z14.h\n"
+ "smin z19.h, p4/M, z19.h, z14.h\n"
+ "st1b { z3.h }, p0, [x9, x12]\n"
+ "st1b { z8.h }, p0, [x28, x12]\n"
+ "st1b { z0.h }, p0, [x27, x12]\n"
+ "st1b { z19.h }, p0, [x26, x12]\n"
+ "inch x12\n"
+ "ld1b { z13.h }, p4/Z, [x14]\n"
+ "ld1b { z11.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #2, MUL VL]\n"
"ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
- ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
- ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
- "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
- "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
- ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
- ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
- "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
- "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "ld1b { z20.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z30.h }, p4/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z28.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z17.h }, p4/Z, [x14, #7, MUL VL]\n"
"inch x14, ALL, MUL #8\n"
- ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- "uzp1 z5.s, z17.s, z16.s\n"
- "uzp2 z9.s, z17.s, z16.s\n"
- "ld1b { z8.h }, p4/Z, [x14]\n"
- "ldp x24, x23, [x28, #0x0]\n"
+ ".inst 0x454f19ad // usublb z13.h, z13.b, z15.b\n"
+ "ld1w { z1.s }, p2/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
+ ".inst 0x454f196b // usublb z11.h, z11.b, z15.b\n"
+ ".inst 0x454f1a52 // usublb z18.h, z18.b, z15.b\n"
+ ".inst 0x454f18c6 // usublb z6.h, z6.b, z15.b\n"
+ "ld1b { z5.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ ".inst 0x454f1a94 // usublb z20.h, z20.b, z15.b\n"
+ ".inst 0x454f1bde // usublb z30.h, z30.b, z15.b\n"
+ "uzp1 z3.s, z1.s, z0.s\n"
+ "uzp2 z16.s, z1.s, z0.s\n"
"str x21, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x20]\n"
- "mov z17.d, z5.d\n"
- "mov z25.d, z9.d\n"
- "ld1b { z0.h }, p3/Z, [x24, x16]\n"
- "ld1b { z29.h }, p3/Z, [x23, x16]\n"
- "mov z16.d, z5.d\n"
- "mov z23.d, z9.d\n"
- "ld1b { z4.h }, p3/Z, [x22, x16]\n"
- "ld1b { z13.h }, p3/Z, [x21, x16]\n"
- "mov z22.d, z5.d\n"
- "mov z27.d, z9.d\n"
- "ld1b { z20.h }, p3/Z, [x20, x16]\n"
- ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
- ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
- ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
- ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
- ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ ".inst 0x454f1b9c // usublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1a31 // usublb z17.h, z17.b, z15.b\n"
+ ".inst 0x454f18a5 // usublb z5.h, z5.b, z15.b\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x17]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1b { z4.h }, p3/Z, [x22, x17]\n"
+ "mov z8.d, z3.d\n"
+ "mov z21.d, z16.d\n"
+ "ld1b { z1.h }, p3/Z, [x21, x17]\n"
+ "mov z0.d, z3.d\n"
+ "mov z29.d, z16.d\n"
+ "ld1b { z27.h }, p3/Z, [x20, x17]\n"
+ "mov z19.d, z3.d\n"
+ "mov z9.d, z16.d\n"
+ ".inst 0x454a18e7 // usublb z7.h, z7.b, z10.b\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x454a1884 // usublb z4.h, z4.b, z10.b\n"
+ ".inst 0x454a1821 // usublb z1.h, z1.b, z10.b\n"
+ ".inst 0x454a1b7b // usublb z27.h, z27.b, z10.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
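
Note (illustration only, not part of the patch): the output stage repeated in the loop above — sqrdmulh by the requant multiplier, the and/asr/sqadd sign fixup, srshl by the requant shift, sqxtnb/sqxtnt narrowing, sqadd of the c_offset and the smax/smin clamp before st1b — is the usual Requantize32 tail. The scalar sketch below models one lane of it under that assumption; the helper names (requantize_lane, sqrdmulh32, srshl32) are hypothetical, and the 16-bit narrowing step is folded into 32-bit arithmetic for brevity, which is numerically equivalent while the intermediate fits in a halfword.

    #include <algorithm>
    #include <cstdint>

    // Scalar counterpart of SQRDMULH.S: saturating rounding doubling multiply
    // returning the high half, i.e. sat((2*a*b + 2^31) >> 32).
    static int32_t sqrdmulh32(int32_t a, int32_t b)
    {
        const int64_t r = (static_cast<int64_t>(a) * b + (1LL << 30)) >> 31;
        return static_cast<int32_t>(std::clamp<int64_t>(r, INT32_MIN, INT32_MAX));
    }

    // Scalar counterpart of SRSHL for the non-positive shifts these kernels load
    // from requant_shifts: a rounding arithmetic shift right by -shift bits.
    static int32_t srshl32(int32_t x, int32_t shift)
    {
        if (shift == 0) return x;
        const int32_t n = -shift;   // shift is expected to be <= 0 here
        return static_cast<int32_t>((static_cast<int64_t>(x) + (1LL << (n - 1))) >> n);
    }

    // One output lane: requantise a 32-bit accumulator to a uint8 activation.
    // c_offset/minval/maxval correspond to the Requantize32 fields broadcast with
    // ld1rh above; mul/shift come from requant_muls/requant_shifts.
    static uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                   int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = sqrdmulh32(acc, mul);             // fixed-point scale (sqrdmulh)
        if (v < 0 && shift < 0 && v != INT32_MIN)
            --v;                                      // and/asr/sqadd fixup: negative
                                                      // ties round away from zero
        v = srshl32(v, shift);                        // rounding shift (srshl)
        v += c_offset;                                // output zero point (sqadd .h)
        v = std::clamp(v, minval, maxval);            // smax/smin activation clamp
        return static_cast<uint8_t>(v);               // st1b stores the low byte
    }
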
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 1ea2fcbfbd..d439d05a60 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,348 +100,348 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x7, #0x0\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x8, #0x0\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x7\n"
- "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z26.b }, p4/Z, [x21]\n"
- "ld1rb { z13.b }, p4/Z, [x20]\n"
- "add x21, x25, %[offsetof_Requantize32_minval]\n"
- "add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z19.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x8\n"
+ "add x20, x27, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x27, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x27, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x27, %[offsetof_Requantize32_minval]\n"
+ "add x20, x27, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z12.b }, p4/Z, [x23]\n"
+ "ld1rh { z25.h }, p4/Z, [x22]\n"
+ "ld1rh { z14.h }, p4/Z, [x21]\n"
"ld1rh { z9.h }, p4/Z, [x20]\n"
- "ldp x16, x15, [x24, #0x0]\n"
- "incw x23\n"
- "whilelt p3.h, x7, x8\n"
- "ldp x14, x13, [x24, #0x10]\n"
- "whilelt p2.s, x7, x8\n"
- "whilelt p1.s, x23, x8\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z25.h }, p4/Z, [x17]\n"
- "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
- ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
- "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
- "ld1w { z17.s }, p2/Z, [x12]\n"
- "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z8.s, z17.s, z16.s\n"
- "uzp2 z24.s, z17.s, z16.s\n"
- "ld1b { z2.h }, p4/Z, [x17]\n"
- "ldp x27, x26, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "mov z18.d, z8.d\n"
- "ldp x25, x24, [x11, #0x10]\n"
- "ldp x23, x22, [x11, #0x20]\n"
- "mov z0.d, z24.d\n"
- "mov z15.d, z8.d\n"
- "ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z21.h }, p3/Z, [x27, x7]\n"
- "mov z1.d, z24.d\n"
- "mov z5.d, z8.d\n"
- "ld1b { z22.h }, p3/Z, [x26, x7]\n"
- "ld1b { z11.h }, p3/Z, [x25, x7]\n"
- "mov z6.d, z24.d\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- "ld1b { z20.h }, p3/Z, [x24, x7]\n"
- "ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
- ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- "ld1b { z16.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
- "ld1b { z31.h }, p3/Z, [x20, x7]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
- ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
- ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
- ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "incw x24\n"
+ "whilelt p3.h, x8, x17\n"
+ "ldp x11, x10, [x26, #0x0]\n"
+ "ldp x9, x28, [x26, #0x10]\n"
+ "whilelt p2.s, x8, x17\n"
+ "whilelt p1.s, x24, x17\n"
+ "ld1b { z28.h }, p4/Z, [x16]\n"
+ "ld1b { z20.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "ld1b { z13.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1b { z2.h }, p4/Z, [x16, #5, MUL VL]\n"
+ "ld1b { z26.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1w { z11.s }, p2/Z, [x25]\n"
+ "ld1w { z4.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a52 // usublb z18.h, z18.b, z12.b\n"
+ "ld1b { z15.h }, p4/Z, [x16]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
+ ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
+ "uzp1 z5.s, z11.s, z4.s\n"
+ "uzp2 z11.s, z11.s, z4.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ ".inst 0x454c1b5a // usublb z26.h, z26.b, z12.b\n"
+ ".inst 0x454c1ab5 // usublb z21.h, z21.b, z12.b\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "mov z30.d, z5.d\n"
+ "mov z16.d, z11.d\n"
+ "mov z4.d, z5.d\n"
+ "mov z8.d, z11.d\n"
+ "mov z31.d, z5.d\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov z10.d, z11.d\n"
+ "ld1b { z3.h }, p3/Z, [x27, x8]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x8]\n"
+ "ld1b { z23.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z0.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x8]\n"
+ "ld1b { z22.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z27.h }, p3/Z, [x21, x8]\n"
+ "ld1b { z19.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x45511a73 // usublb z19.h, z19.b, z17.b\n"
"1:" // Loop
- ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
- "ldr x21, [x11, #0x58]\n"
- "ldr x20, [x11, #0x78]\n"
- ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
- ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
- "ld1b { z17.h }, p3/Z, [x21, x7]\n"
- "ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
- "ldr x21, [x11, #0x60]\n"
- "ldr x20, [x11, #0x80]\n"
- ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
- ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
- ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
- ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
- ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
- "ld1b { z22.h }, p3/Z, [x21, x7]\n"
- ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
- ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
- ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
- "ldr x21, [x11, #0x68]\n"
- ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
- "ld1b { z21.h }, p3/Z, [x20, x7]\n"
- "ldr x20, [x11, #0x88]\n"
- ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
- ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
- ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
- ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
- "ldr x22, [x11, #0x40]\n"
- ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
- ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- "ld1b { z11.h }, p3/Z, [x21, x7]\n"
- ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
- "ld1b { z20.h }, p3/Z, [x20, x7]\n"
- ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
- "ldr x20, [x11, #0x98]\n"
- ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
- "ldr x23, [x11, #0x50]\n"
- ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
- ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- "ld1b { z17.h }, p3/Z, [x22, x7]\n"
- ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x7]\n"
- "ld1b { z28.h }, p3/Z, [x20, x7]\n"
- ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
- "ldr x22, [x11, #0x48]\n"
- ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
- ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
- "ldr x21, [x11, #0x90]\n"
- "ldr x20, [x11, #0xa8]\n"
- ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
- ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
- ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
- ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
- ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
- ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
- "ld1b { z16.h }, p3/Z, [x22, x7]\n"
- ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
- ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
- "ld1b { z11.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
- ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
- "ldr x21, [x11, #0xa0]\n"
- "ldr x20, [x11, #0xb0]\n"
- ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
- ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
- ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
- ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
- ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
- ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
- "ld1b { z20.h }, p3/Z, [x21, x7]\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x7]\n"
- ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
- ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
- ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
- "ldr x20, [x11, #0xb8]\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
- ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
- ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
- "ld1b { z30.h }, p3/Z, [x20, x7]\n"
- ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
- ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z17.s }, p2/Z, [x9]\n"
- ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
- ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
- "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
- ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
- ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
- ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x7]\n"
- "uzp1 z10.s, z17.s, z14.s\n"
- ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
- "uzp2 z14.s, z17.s, z14.s\n"
- "ld1w { z17.s }, p2/Z, [x28]\n"
- ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
- ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
- "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
- ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
- ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
- ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
- "uzp1 z4.s, z17.s, z16.s\n"
- "inch x7\n"
- ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
- ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
- ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
- ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
- "uzp2 z22.s, z17.s, z16.s\n"
- "mov x20, x7\n"
- ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
- ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
- "and z17.d, z8.d, z4.d\n"
- "inch x17\n"
- ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
- ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
- ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
- "incw x20\n"
- ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
- ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- "whilelt p2.s, x7, x8\n"
- ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
- "and z16.d, z24.d, z22.d\n"
- "whilelt p1.s, x20, x8\n"
- ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ ".inst 0x448f4065 // smlalb z5.s, p4/M, z3.h, z15.h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x78]\n"
+ ".inst 0x448f446b // smlalt z11.s, p4/M, z3.h, z15.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x80]\n"
+ ".inst 0x449a407e // smlalb z30.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448d4064 // smlalb z4.s, p4/M, z3.h, z13.h\n"
+ ".inst 0x449c407f // smlalb z31.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449a4470 // smlalt z16.s, p4/M, z3.h, z26.h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "ld1b { z1.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448d4468 // smlalt z8.s, p4/M, z3.h, z13.h\n"
+ ".inst 0x449c446a // smlalt z10.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c43a5 // smlalb z5.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x449c47ab // smlalt z11.s, p4/M, z29.h, z28.h\n"
+ "ld1b { z29.h }, p3/Z, [x23, x8]\n"
+ "ld1b { z3.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x4494401e // smlalb z30.s, p4/M, z0.h, z20.h\n"
+ "ldr x25, [x15, #0x40]\n"
+ "ldr x24, [x15, #0x70]\n"
+ "whilelt p0.h, x14, x17\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x455118e7 // usublb z7.h, z7.b, z17.b\n"
+ ".inst 0x44944410 // smlalt z16.s, p4/M, z0.h, z20.h\n"
+ "ld1b { z0.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ "ldr x23, [x15, #0x98]\n"
+ "ldr x22, [x15, #0x50]\n"
+ ".inst 0x449442e5 // smlalb z5.s, p4/M, z23.h, z20.h\n"
+ ".inst 0x449446eb // smlalt z11.s, p4/M, z23.h, z20.h\n"
+ "ld1b { z23.h }, p3/Z, [x20, x8]\n"
+ "ldr x21, [x15, #0x48]\n"
+ ".inst 0x44924024 // smlalb z4.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448640ff // smlalb z31.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ "ldr x20, [x15, #0x90]\n"
+ ".inst 0x44924428 // smlalt z8.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448644ea // smlalt z10.s, p4/M, z7.h, z6.h\n"
+ "ld1b { z1.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448d431e // smlalb z30.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x448d4710 // smlalt z16.s, p4/M, z24.h, z13.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x449242c5 // smlalb z5.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x449246cb // smlalt z11.s, p4/M, z22.h, z18.h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "ld1b { z22.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x449c43a4 // smlalb z4.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x4494407f // smlalb z31.s, p4/M, z3.h, z20.h\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ "ldr x23, [x15, #0xa0]\n"
+ ".inst 0x449c47a8 // smlalt z8.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x4494446a // smlalt z10.s, p4/M, z3.h, z20.h\n"
+ ".inst 0x455118e7 // usublb z7.h, z7.b, z17.b\n"
+ "ldr x22, [x15, #0xb0]\n"
+ ".inst 0x449c427e // smlalb z30.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x449c4670 // smlalt z16.s, p4/M, z19.h, z28.h\n"
+ "ld1b { z28.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x44864365 // smlalb z5.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x4486476b // smlalt z11.s, p4/M, z27.h, z6.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x44864004 // smlalb z4.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x448242ff // smlalb z31.s, p4/M, z23.h, z2.h\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ ".inst 0x44864408 // smlalt z8.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z0.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
+ ".inst 0x4486403e // smlalb z30.s, p4/M, z1.h, z6.h\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ "ld1b { z23.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x44864430 // smlalt z16.s, p4/M, z1.h, z6.h\n"
+ ".inst 0x448d4265 // smlalb z5.s, p4/M, z19.h, z13.h\n"
+ ".inst 0x448d466b // smlalt z11.s, p4/M, z19.h, z13.h\n"
+ "ld1b { z6.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z1.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x449440e4 // smlalb z4.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x448d431f // smlalb z31.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ "ld1w { z19.s }, p2/Z, [x13]\n"
+ ".inst 0x449444e8 // smlalt z8.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x448d470a // smlalt z10.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ "ld1w { z20.s }, p1/Z, [x13, #1, MUL VL]\n"
+ ".inst 0x4482439e // smlalb z30.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x455118c6 // usublb z6.h, z6.b, z17.b\n"
+ ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
+ "ld1b { z13.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x448242c5 // smlalb z5.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x448246cb // smlalt z11.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ "inch x8\n"
+ ".inst 0x449a4364 // smlalb z4.s, p4/M, z27.h, z26.h\n"
+ ".inst 0x4492401f // smlalb z31.s, p4/M, z0.h, z18.h\n"
+ "uzp1 z28.s, z19.s, z20.s\n"
+ "inch x16\n"
+ ".inst 0x449a4768 // smlalt z8.s, p4/M, z27.h, z26.h\n"
+ ".inst 0x4492440a // smlalt z10.s, p4/M, z0.h, z18.h\n"
+ "uzp2 z20.s, z19.s, z20.s\n"
+ "ld1w { z27.s }, p2/Z, [x12]\n"
+ ".inst 0x449242de // smlalb z30.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x449246d0 // smlalt z16.s, p4/M, z22.h, z18.h\n"
+ "ld1w { z19.s }, p1/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x455119ad // usublb z13.h, z13.b, z17.b\n"
+ ".inst 0x449a43a5 // smlalb z5.s, p4/M, z29.h, z26.h\n"
+ ".inst 0x449a47ab // smlalt z11.s, p4/M, z29.h, z26.h\n"
+ "mov x21, x8\n"
+ "whilelt p2.s, x8, x17\n"
+ ".inst 0x449542e4 // smlalb z4.s, p4/M, z23.h, z21.h\n"
+ ".inst 0x449540df // smlalb z31.s, p4/M, z6.h, z21.h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
- "whilelt p3.h, x7, x8\n"
- "addvl x9, x9, #2\n"
- ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
- "sqadd z8.s, z8.s, z17.s\n"
- ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
- "addvl x28, x28, #2\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z21.d, z18.d, z4.d\n"
- ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
- "and z20.d, z15.d, z4.d\n"
- ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
- "and z28.d, z5.d, z4.d\n"
- ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
- "sqadd z24.s, z24.s, z16.s\n"
- ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z25.d, z0.d, z22.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z17.d, z1.d, z22.d\n"
- "asr z28.s, z28.s, #0x1f\n"
- "and z16.d, z6.d, z22.d\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "asr z25.s, z25.s, #0x1f\n"
- ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
- "sqadd z15.s, z15.s, z20.s\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
- "sqadd z5.s, z5.s, z28.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
- "sqadd z0.s, z0.s, z25.s\n"
- "sqadd z1.s, z1.s, z17.s\n"
- ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
- ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
- "sqadd z6.s, z6.s, z16.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
- ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
- ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
- ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ "addvl x13, x13, #2\n"
+ ".inst 0x449546e8 // smlalt z8.s, p4/M, z23.h, z21.h\n"
+ ".inst 0x449544ca // smlalt z10.s, p4/M, z6.h, z21.h\n"
+ "uzp1 z23.s, z27.s, z19.s\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x4495407e // smlalb z30.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44954470 // smlalt z16.s, p4/M, z3.h, z21.h\n"
+ "uzp2 z6.s, z27.s, z19.s\n"
+ "incw x21\n"
+ ".inst 0x449540e5 // smlalb z5.s, p4/M, z7.h, z21.h\n"
+ ".inst 0x449544eb // smlalt z11.s, p4/M, z7.h, z21.h\n"
+ ".inst 0x44824004 // smlalb z4.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x449a403f // smlalb z31.s, p4/M, z1.h, z26.h\n"
+ ".inst 0x44824408 // smlalt z8.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x449a442a // smlalt z10.s, p4/M, z1.h, z26.h\n"
+ "whilelt p1.s, x21, x17\n"
+ "whilelt p3.h, x8, x17\n"
+ ".inst 0x448f431e // smlalb z30.s, p4/M, z24.h, z15.h\n"
+ ".inst 0x448f4710 // smlalt z16.s, p4/M, z24.h, z15.h\n"
+ ".inst 0x04bc74a5 // sqrdmulh z5.s, z5.s, z28.s\n"
+ ".inst 0x04b4756b // sqrdmulh z11.s, z11.s, z20.s\n"
+ ".inst 0x448f4024 // smlalb z4.s, p4/M, z1.h, z15.h\n"
+ ".inst 0x448f41bf // smlalb z31.s, p4/M, z13.h, z15.h\n"
+ "and z24.d, z5.d, z23.d\n"
+ ".inst 0x448f4428 // smlalt z8.s, p4/M, z1.h, z15.h\n"
+ ".inst 0x448f45aa // smlalt z10.s, p4/M, z13.h, z15.h\n"
+ "and z19.d, z11.d, z6.d\n"
+ ".inst 0x04bc77de // sqrdmulh z30.s, z30.s, z28.s\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ ".inst 0x04bc7484 // sqrdmulh z4.s, z4.s, z28.s\n"
+ ".inst 0x04bc77ff // sqrdmulh z31.s, z31.s, z28.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z7.d, z30.d, z23.d\n"
+ "sqadd z5.s, z5.s, z24.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ "and z15.d, z4.d, z23.d\n"
+ "and z24.d, z31.d, z23.d\n"
+ ".inst 0x04b4754a // sqrdmulh z10.s, z10.s, z20.s\n"
+ "sqadd z11.s, z11.s, z19.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z18.d, z16.d, z6.d\n"
+ ".inst 0x448292e5 // srshl z5.s, p4/M, z5.s, z23.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "and z13.d, z8.d, z6.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "and z3.d, z10.d, z6.d\n"
+ ".inst 0x448290cb // srshl z11.s, p4/M, z11.s, z6.s\n"
+ "sqadd z30.s, z30.s, z7.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z15.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z24.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x448292fe // srshl z30.s, p4/M, z30.s, z23.s\n"
+ "sqadd z16.s, z16.s, z18.s\n"
".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
- ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
- ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
- ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
- ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
- "sqadd z8.h, z8.h, z19.h\n"
- "smax z8.h, p4/M, z8.h, z12.h\n"
- "smin z8.h, p4/M, z8.h, z9.h\n"
- "sqadd z18.h, z18.h, z19.h\n"
- "sqadd z15.h, z15.h, z19.h\n"
- "smax z18.h, p4/M, z18.h, z12.h\n"
- "smax z15.h, p4/M, z15.h, z12.h\n"
- "sqadd z5.h, z5.h, z19.h\n"
- "smax z5.h, p4/M, z5.h, z12.h\n"
- "smin z18.h, p4/M, z18.h, z9.h\n"
- "st1b { z8.h }, p0, [x16, x10]\n"
- "smin z15.h, p4/M, z15.h, z9.h\n"
+ ".inst 0x448292e4 // srshl z4.s, p4/M, z4.s, z23.s\n"
+ "sqadd z8.s, z8.s, z13.s\n"
+ ".inst 0x448292ff // srshl z31.s, p4/M, z31.s, z23.s\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ ".inst 0x453043de // sqxtnb z30.h, z30.s\n"
+ ".inst 0x448290d0 // srshl z16.s, p4/M, z16.s, z6.s\n"
+ ".inst 0x45304084 // sqxtnb z4.h, z4.s\n"
+ ".inst 0x45304565 // sqxtnt z5.h, z11.s\n"
+ ".inst 0x448290c8 // srshl z8.s, p4/M, z8.s, z6.s\n"
+ ".inst 0x448290ca // srshl z10.s, p4/M, z10.s, z6.s\n"
+ ".inst 0x453043ff // sqxtnb z31.h, z31.s\n"
+ ".inst 0x4530461e // sqxtnt z30.h, z16.s\n"
+ ".inst 0x45304504 // sqxtnt z4.h, z8.s\n"
+ ".inst 0x4530455f // sqxtnt z31.h, z10.s\n"
+ "sqadd z5.h, z5.h, z25.h\n"
+ "sqadd z30.h, z30.h, z25.h\n"
+ "sqadd z4.h, z4.h, z25.h\n"
+ "sqadd z31.h, z31.h, z25.h\n"
+ "smax z5.h, p4/M, z5.h, z14.h\n"
+ "smax z30.h, p4/M, z30.h, z14.h\n"
+ "smax z4.h, p4/M, z4.h, z14.h\n"
+ "smax z31.h, p4/M, z31.h, z14.h\n"
"smin z5.h, p4/M, z5.h, z9.h\n"
- "st1b { z18.h }, p0, [x15, x10]\n"
- "st1b { z15.h }, p0, [x14, x10]\n"
- "st1b { z5.h }, p0, [x13, x10]\n"
- "ld1b { z25.h }, p4/Z, [x17]\n"
- "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
- "inch x10\n"
- "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
- ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
- "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
- ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
- "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
- "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
- "uzp1 z8.s, z17.s, z16.s\n"
- "uzp2 z24.s, z17.s, z16.s\n"
- "ld1b { z2.h }, p4/Z, [x17]\n"
- "ldp x27, x26, [x11, #0x0]\n"
+ "smin z30.h, p4/M, z30.h, z9.h\n"
+ "smin z4.h, p4/M, z4.h, z9.h\n"
+ "smin z31.h, p4/M, z31.h, z9.h\n"
+ "st1b { z5.h }, p0, [x11, x14]\n"
+ "st1b { z30.h }, p0, [x10, x14]\n"
+ "st1b { z4.h }, p0, [x9, x14]\n"
+ "st1b { z31.h }, p0, [x28, x14]\n"
+ "inch x14\n"
+ "ld1b { z28.h }, p4/Z, [x16]\n"
+ "ld1b { z20.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "ld1b { z13.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1b { z2.h }, p4/Z, [x16, #5, MUL VL]\n"
+ "ld1b { z26.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20, #1, MUL VL]\n"
"addvl x20, x20, #2\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a52 // usublb z18.h, z18.b, z12.b\n"
+ "ld1b { z15.h }, p4/Z, [x16]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n"
+ ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
+ "uzp1 z5.s, z10.s, z1.s\n"
+ "uzp2 z11.s, z10.s, z1.s\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x25, x24, [x11, #0x10]\n"
- "ldp x23, x22, [x11, #0x20]\n"
- "mov z18.d, z8.d\n"
- "mov z0.d, z24.d\n"
- "ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z21.h }, p3/Z, [x27, x7]\n"
- "mov z15.d, z8.d\n"
- "mov z1.d, z24.d\n"
- "ld1b { z22.h }, p3/Z, [x26, x7]\n"
- "ld1b { z11.h }, p3/Z, [x25, x7]\n"
- "mov z5.d, z8.d\n"
- "mov z6.d, z24.d\n"
- "ld1b { z20.h }, p3/Z, [x24, x7]\n"
- "ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
- ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- "ld1b { z16.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
- ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
- "ld1b { z31.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
- ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
- ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
- ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
- ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ ".inst 0x454c1b5a // usublb z26.h, z26.b, z12.b\n"
+ ".inst 0x454c1ab5 // usublb z21.h, z21.b, z12.b\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "mov z30.d, z5.d\n"
+ "mov z16.d, z11.d\n"
+ "mov z4.d, z5.d\n"
+ "mov z8.d, z11.d\n"
+ "mov z31.d, z5.d\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov z10.d, z11.d\n"
+ "ld1b { z3.h }, p3/Z, [x27, x8]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x8]\n"
+ "ld1b { z23.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z0.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x8]\n"
+ "ld1b { z22.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z27.h }, p3/Z, [x21, x8]\n"
+ "ld1b { z19.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x45511a73 // usublb z19.h, z19.b, z17.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index b8adbb8262..5604760aa3 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -112,533 +112,533 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"mov x2, #0x0\n"
- "mov x24, x2\n"
- "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "incw x24\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_outptrs]]\n"
"ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
- "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z30.b }, p4/Z, [x21]\n"
- "ld1rb { z10.b }, p4/Z, [x20]\n"
- "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x6, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x2\n"
+ "add x20, x27, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x27, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x27, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z14.b }, p4/Z, [x20]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x27, %[offsetof_Requantize32_minval]\n"
+ "add x20, x27, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z12.b }, p4/Z, [x23]\n"
+ "ld1rh { z10.h }, p4/Z, [x22]\n"
+ "incw x24\n"
"ld1rh { z15.h }, p4/Z, [x21]\n"
- "ld1rh { z12.h }, p4/Z, [x20]\n"
- "add x20, x23, %[offsetof_Requantize32_maxval]\n"
"ld1rh { z13.h }, p4/Z, [x20]\n"
- "ldp x5, x6, [x22, #0x0]\n"
"whilelt p3.h, x2, x3\n"
- "ldp x7, x8, [x22, #0x10]\n"
+ "ldp x17, x16, [x26, #0x0]\n"
+ "ldp x15, x14, [x26, #0x10]\n"
"whilelt p2.s, x2, x3\n"
"whilelt p1.s, x24, x3\n"
- "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
- "add x17, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z17.s }, p2/Z, [x10]\n"
- "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
- "uzp1 z14.s, z17.s, z16.s\n"
- "ld1b { z26.h }, p4/Z, [x4]\n"
- "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
- "uzp2 z23.s, z17.s, z16.s\n"
- "addvl x10, x10, #2\n"
- "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
- "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
- "mov x16, #0x0\n"
- "mov z6.d, z14.d\n"
- "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
- "ldp x9, x28, [x17, #0x0]\n"
- "mov z18.d, z23.d\n"
- "mov z9.d, z14.d\n"
- "ldp x27, x26, [x17, #0x10]\n"
- "ldp x25, x24, [x17, #0x20]\n"
- "mov z20.d, z23.d\n"
- "mov z7.d, z14.d\n"
- "ldp x23, x22, [x17, #0x30]\n"
- "ldp x21, x20, [x17, #0x40]\n"
- "mov z1.d, z23.d\n"
- ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
- "ld1b { z22.h }, p3/Z, [x9, x2]\n"
- "ld1b { z2.h }, p3/Z, [x28, x2]\n"
- ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
- ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
- "ld1b { z11.h }, p3/Z, [x27, x2]\n"
- "ld1b { z3.h }, p3/Z, [x26, x2]\n"
- ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
- ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
- "ld1b { z29.h }, p3/Z, [x25, x2]\n"
- "ld1b { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
- ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
- "ld1b { z31.h }, p3/Z, [x23, x2]\n"
- "ld1b { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- "ld1b { z19.h }, p3/Z, [x21, x2]\n"
- "ld1b { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x10, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "ld1w { z5.s }, p2/Z, [x25]\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1b { z25.h }, p4/Z, [x4]\n"
+ "ld1b { z28.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z23.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "ld1b { z31.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "uzp1 z6.s, z5.s, z16.s\n"
+ "uzp2 z30.s, z5.s, z16.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454c1b39 // usublb z25.h, z25.b, z12.b\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c1af7 // usublb z23.h, z23.b, z12.b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov z17.d, z6.d\n"
+ "mov z8.d, z30.d\n"
+ "mov z21.d, z6.d\n"
+ "mov z27.d, z30.d\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov z7.d, z6.d\n"
+ "mov z9.d, z30.d\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ld1b { z26.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z16.h }, p3/Z, [x28, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z5.h }, p3/Z, [x26, x2]\n"
+ "ld1b { z18.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x24, x2]\n"
+ "ld1b { z19.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z11.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454e1b5a // usublb z26.h, z26.b, z14.b\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x454e18a5 // usublb z5.h, z5.b, z14.b\n"
+ ".inst 0x454e1a52 // usublb z18.h, z18.b, z14.b\n"
+ ".inst 0x454e1863 // usublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ ".inst 0x454e196b // usublb z11.h, z11.b, z14.b\n"
+ ".inst 0x454e1a94 // usublb z20.h, z20.b, z14.b\n"
+ ".inst 0x454e1bbd // usublb z29.h, z29.b, z14.b\n"
"1:" // Loop
- ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
- ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
- "ldr x20, [x17, #0x50]\n"
- "ld1b { z27.h }, p3/Z, [x20, x2]\n"
- ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
- ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
- "ldr x20, [x17, #0x58]\n"
- ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
- ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
- ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
- "ld1b { z5.h }, p3/Z, [x20, x2]\n"
- "ldr x20, [x17, #0x60]\n"
- ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
- ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
- "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
- ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
- ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
- "ld1b { z22.h }, p3/Z, [x20, x2]\n"
- ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
- ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
- ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
- "ldr x20, [x17, #0x68]\n"
+ ".inst 0x44994346 // smlalb z6.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x4499475e // smlalt z30.s, p4/M, z26.h, z25.h\n"
+ "ldr x23, [x5, #0x50]\n"
+ "ldr x22, [x5, #0x58]\n"
+ ".inst 0x44994211 // smlalb z17.s, p4/M, z16.h, z25.h\n"
+ ".inst 0x44994315 // smlalb z21.s, p4/M, z24.h, z25.h\n"
+ "ldr x21, [x5, #0x60]\n"
+ "ld1b { z0.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x449940a7 // smlalb z7.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994608 // smlalt z8.s, p4/M, z16.h, z25.h\n"
+ "ldr x20, [x5, #0x68]\n"
+ "ld1b { z26.h }, p4/Z, [x4, #6, MUL VL]\n"
+ "ld1b { z2.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x4499471b // smlalt z27.s, p4/M, z24.h, z25.h\n"
+ ".inst 0x449944a9 // smlalt z9.s, p4/M, z5.h, z25.h\n"
+ "ld1b { z22.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x449c4206 // smlalb z6.s, p4/M, z16.h, z28.h\n"
+ ".inst 0x449c461e // smlalt z30.s, p4/M, z16.h, z28.h\n"
+ "ld1b { z1.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x449c4251 // smlalb z17.s, p4/M, z18.h, z28.h\n"
+ ".inst 0x449c40b5 // smlalb z21.s, p4/M, z5.h, z28.h\n"
+ "ld1b { z16.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454c1b5a // usublb z26.h, z26.b, z12.b\n"
+ ".inst 0x449c4067 // smlalb z7.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x454e1842 // usublb z2.h, z2.b, z14.b\n"
+ ".inst 0x449c4648 // smlalt z8.s, p4/M, z18.h, z28.h\n"
+ "ldr x20, [x5, #0x70]\n"
+ ".inst 0x449c44bb // smlalt z27.s, p4/M, z5.h, z28.h\n"
+ ".inst 0x449c4469 // smlalt z9.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x454e1ad6 // usublb z22.h, z22.b, z14.b\n"
+ "ld1b { z28.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x44844246 // smlalb z6.s, p4/M, z18.h, z4.h\n"
+ ".inst 0x4484465e // smlalt z30.s, p4/M, z18.h, z4.h\n"
+ ".inst 0x454e1821 // usublb z1.h, z1.b, z14.b\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44844271 // smlalb z17.s, p4/M, z19.h, z4.h\n"
+ ".inst 0x44844075 // smlalb z21.s, p4/M, z3.h, z4.h\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ "ld1b { z25.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44844047 // smlalb z7.s, p4/M, z2.h, z4.h\n"
+ ".inst 0x44844668 // smlalt z8.s, p4/M, z19.h, z4.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ldr x20, [x5, #0x78]\n"
+ ".inst 0x4484447b // smlalt z27.s, p4/M, z3.h, z4.h\n"
+ ".inst 0x44844449 // smlalt z9.s, p4/M, z2.h, z4.h\n"
+ "ld1b { z18.h }, p4/Z, [x4]\n"
+ "ldr x22, [x5, #0x80]\n"
+ ".inst 0x44974266 // smlalb z6.s, p4/M, z19.h, z23.h\n"
+ ".inst 0x4497467e // smlalt z30.s, p4/M, z19.h, z23.h\n"
+ ".inst 0x454e1b39 // usublb z25.h, z25.b, z14.b\n"
+ "ld1b { z4.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44974171 // smlalb z17.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x44974055 // smlalb z21.s, p4/M, z2.h, z23.h\n"
+ "ld1b { z19.h }, p3/Z, [x20, x2]\n"
+ "ldr x21, [x5, #0x88]\n"
+ ".inst 0x449742c7 // smlalb z7.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44974568 // smlalt z8.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x454c1a52 // usublb z18.h, z18.b, z12.b\n"
+ "ldr x20, [x5, #0x90]\n"
+ ".inst 0x4497445b // smlalt z27.s, p4/M, z2.h, z23.h\n"
+ ".inst 0x449746c9 // smlalt z9.s, p4/M, z22.h, z23.h\n"
+ "ld1b { z23.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x449f4166 // smlalb z6.s, p4/M, z11.h, z31.h\n"
+ ".inst 0x449f457e // smlalt z30.s, p4/M, z11.h, z31.h\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ "ld1b { z11.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x449f4031 // smlalb z17.s, p4/M, z1.h, z31.h\n"
+ ".inst 0x449f42d5 // smlalb z21.s, p4/M, z22.h, z31.h\n"
+ "ldr x23, [x5, #0x98]\n"
+ "ldr x22, [x5, #0xa0]\n"
+ ".inst 0x449f4287 // smlalb z7.s, p4/M, z20.h, z31.h\n"
+ ".inst 0x449f4428 // smlalt z8.s, p4/M, z1.h, z31.h\n"
+ ".inst 0x454e1af7 // usublb z23.h, z23.b, z14.b\n"
+ "ld1b { z1.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x449f46db // smlalt z27.s, p4/M, z22.h, z31.h\n"
+ ".inst 0x449f4689 // smlalt z9.s, p4/M, z20.h, z31.h\n"
+ ".inst 0x454c196b // usublb z11.h, z11.b, z12.b\n"
+ "ld1b { z31.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x4480471e // smlalt z30.s, p4/M, z24.h, z0.h\n"
+ "ld1b { z24.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ ".inst 0x448040b1 // smlalb z17.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x448043b5 // smlalb z21.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x454e1821 // usublb z1.h, z1.b, z14.b\n"
+ "ldr x21, [x5, #0xb0]\n"
+ ".inst 0x44804207 // smlalb z7.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x448044a8 // smlalt z8.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x13, [x5, #0xb8]\n"
+ ".inst 0x448047bb // smlalt z27.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44804609 // smlalt z9.s, p4/M, z16.h, z0.h\n"
+ "ld1b { z0.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x449a40a6 // smlalb z6.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a44be // smlalt z30.s, p4/M, z5.h, z26.h\n"
+ "ld1b { z5.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldr x12, [x5, #0xc0]\n"
+ ".inst 0x449a4071 // smlalb z17.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a4215 // smlalb z21.s, p4/M, z16.h, z26.h\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "ldr x10, [x5, #0xd0]\n"
+ ".inst 0x449a4327 // smlalb z7.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449a4468 // smlalt z8.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x454e1800 // usublb z0.h, z0.b, z14.b\n"
+ "ldr x9, [x5, #0xd8]\n"
+ ".inst 0x449a461b // smlalt z27.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4729 // smlalt z9.s, p4/M, z25.h, z26.h\n"
+ "ld1b { z26.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n"
+ ".inst 0x449c4066 // smlalb z6.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c447e // smlalt z30.s, p4/M, z3.h, z28.h\n"
+ "ld1b { z3.h }, p4/Z, [x4, #5, MUL VL]\n"
+ "ldr x28, [x5, #0xe0]\n"
+ ".inst 0x449c4051 // smlalb z17.s, p4/M, z2.h, z28.h\n"
+ ".inst 0x449c4335 // smlalb z21.s, p4/M, z25.h, z28.h\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "ldr x26, [x5, #0xf0]\n"
+ ".inst 0x449c4267 // smlalb z7.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x449c4448 // smlalt z8.s, p4/M, z2.h, z28.h\n"
+ ".inst 0x454e1b5a // usublb z26.h, z26.b, z14.b\n"
+ "ldr x25, [x5, #0xf8]\n"
+ ".inst 0x449c473b // smlalt z27.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4669 // smlalt z9.s, p4/M, z19.h, z28.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x44924046 // smlalb z6.s, p4/M, z2.h, z18.h\n"
+ ".inst 0x4492445e // smlalt z30.s, p4/M, z2.h, z18.h\n"
"ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
- ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
- ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
- "ld1b { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
- ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
- "ldr x20, [x17, #0x70]\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
- "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
- ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
- ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
- ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "ldr x24, [x5, #0x100]\n"
+ ".inst 0x449242d1 // smlalb z17.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x44924275 // smlalb z21.s, p4/M, z19.h, z18.h\n"
+ "ldr x23, [x5, #0x108]\n"
+ "ldr x22, [x5, #0x110]\n"
+ ".inst 0x449242e7 // smlalb z7.s, p4/M, z23.h, z18.h\n"
+ ".inst 0x449246c8 // smlalt z8.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x454e1b9c // usublb z28.h, z28.b, z14.b\n"
+ "ldr x20, [x5, #0x118]\n"
+ ".inst 0x4492467b // smlalt z27.s, p4/M, z19.h, z18.h\n"
+ ".inst 0x449246e9 // smlalt z9.s, p4/M, z23.h, z18.h\n"
+ "ld1b { z18.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
+ ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448446de // smlalt z30.s, p4/M, z22.h, z4.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #7, MUL VL]\n"
"inch x4, ALL, MUL #8\n"
- "ld1b { z8.h }, p3/Z, [x20, x2]\n"
- ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
- ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
- ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
- "ldr x20, [x17, #0x78]\n"
- ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
- ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
- "ld1b { z24.h }, p4/Z, [x4]\n"
- ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
- ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
- ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
- ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
- ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
- "ldr x22, [x17, #0x80]\n"
+ ".inst 0x44844291 // smlalb z17.s, p4/M, z20.h, z4.h\n"
+ ".inst 0x448442f5 // smlalb z21.s, p4/M, z23.h, z4.h\n"
+ "whilelt p0.h, x6, x3\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44844027 // smlalb z7.s, p4/M, z1.h, z4.h\n"
+ ".inst 0x44844688 // smlalt z8.s, p4/M, z20.h, z4.h\n"
+ ".inst 0x454e1a52 // usublb z18.h, z18.b, z14.b\n"
+ "ld1b { z20.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x448446fb // smlalt z27.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44844429 // smlalt z9.s, p4/M, z1.h, z4.h\n"
+ ".inst 0x454c1ad6 // usublb z22.h, z22.b, z12.b\n"
+ "ld1b { z4.h }, p4/Z, [x4]\n"
+ ".inst 0x448b43a6 // smlalb z6.s, p4/M, z29.h, z11.h\n"
+ ".inst 0x448b47be // smlalt z30.s, p4/M, z29.h, z11.h\n"
+ "ld1b { z29.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x448b4211 // smlalb z17.s, p4/M, z16.h, z11.h\n"
+ ".inst 0x448b4315 // smlalb z21.s, p4/M, z24.h, z11.h\n"
+ ".inst 0x454e1a94 // usublb z20.h, z20.b, z14.b\n"
+ ".inst 0x448b4007 // smlalb z7.s, p4/M, z0.h, z11.h\n"
+ ".inst 0x448b4608 // smlalt z8.s, p4/M, z16.h, z11.h\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448b471b // smlalt z27.s, p4/M, z24.h, z11.h\n"
+ ".inst 0x448b4409 // smlalt z9.s, p4/M, z0.h, z11.h\n"
+ "ld1b { z11.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x454e1bbd // usublb z29.h, z29.b, z14.b\n"
+ ".inst 0x449f4206 // smlalb z6.s, p4/M, z16.h, z31.h\n"
+ ".inst 0x449f461e // smlalt z30.s, p4/M, z16.h, z31.h\n"
"ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
- ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
- ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- "ldr x21, [x17, #0x88]\n"
- ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
- ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
- ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
- "ldr x20, [x17, #0x90]\n"
- ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
- ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
- "ld1b { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
- ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
- "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
- ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
- ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
- ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
- "ldr x23, [x17, #0x98]\n"
- "ldr x22, [x17, #0xa0]\n"
- ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
- ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
- "ld1b { z11.h }, p3/Z, [x21, x2]\n"
- ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
- ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
- ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
- "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
- ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
- ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
- ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
- "ld1b { z17.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
- ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
- ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
- "ldr x20, [x17, #0xa8]\n"
- "ldr x21, [x17, #0xb0]\n"
- ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
- ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
- "ldr x13, [x17, #0xb8]\n"
- "ldr x12, [x17, #0xc0]\n"
- ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
- ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
- "ld1b { z3.h }, p3/Z, [x23, x2]\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
- ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
- "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
- ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
- ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
- "ldr x11, [x17, #0xc8]\n"
- "ldr x10, [x17, #0xd0]\n"
- ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
- ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
- "ldr x9, [x17, #0xd8]\n"
- "ldr x28, [x17, #0xe0]\n"
- ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
- ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
- "ld1b { z4.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
- ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449f4331 // smlalb z17.s, p4/M, z25.h, z31.h\n"
+ ".inst 0x449f4015 // smlalb z21.s, p4/M, z0.h, z31.h\n"
+ ".inst 0x449f4347 // smlalb z7.s, p4/M, z26.h, z31.h\n"
+ ".inst 0x449f4728 // smlalt z8.s, p4/M, z25.h, z31.h\n"
+ ".inst 0x454e196b // usublb z11.h, z11.b, z14.b\n"
+ ".inst 0x449f441b // smlalt z27.s, p4/M, z0.h, z31.h\n"
+ ".inst 0x449f4749 // smlalt z9.s, p4/M, z26.h, z31.h\n"
+ "ld1b { z31.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x454c1a10 // usublb z16.h, z16.b, z12.b\n"
+ ".inst 0x44854326 // smlalb z6.s, p4/M, z25.h, z5.h\n"
+ ".inst 0x4485473e // smlalt z30.s, p4/M, z25.h, z5.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x44854271 // smlalb z17.s, p4/M, z19.h, z5.h\n"
+ ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854387 // smlalb z7.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x44854668 // smlalt z8.s, p4/M, z19.h, z5.h\n"
+ ".inst 0x454e1bff // usublb z31.h, z31.b, z14.b\n"
+ ".inst 0x4485475b // smlalt z27.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z5.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x454c1b39 // usublb z25.h, z25.b, z12.b\n"
+ ".inst 0x44834266 // smlalb z6.s, p4/M, z19.h, z3.h\n"
+ ".inst 0x4483467e // smlalt z30.s, p4/M, z19.h, z3.h\n"
+ "ld1b { z19.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x448342f1 // smlalb z17.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44834395 // smlalb z21.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834247 // smlalb z7.s, p4/M, z18.h, z3.h\n"
+ ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x454e18a5 // usublb z5.h, z5.b, z14.b\n"
+ ".inst 0x4483479b // smlalt z27.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834649 // smlalt z9.s, p4/M, z18.h, z3.h\n"
+ "ld1b { z3.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x448242e6 // smlalb z6.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448246fe // smlalt z30.s, p4/M, z23.h, z2.h\n"
+ "ld1b { z23.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x44824031 // smlalb z17.s, p4/M, z1.h, z2.h\n"
+ ".inst 0x44824255 // smlalb z21.s, p4/M, z18.h, z2.h\n"
+ ".inst 0x44824287 // smlalb z7.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x44824428 // smlalt z8.s, p4/M, z1.h, z2.h\n"
+ ".inst 0x454e1863 // usublb z3.h, z3.b, z14.b\n"
+ "ld1b { z1.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x4482465b // smlalt z27.s, p4/M, z18.h, z2.h\n"
+ ".inst 0x44824689 // smlalt z9.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x454c1af7 // usublb z23.h, z23.b, z12.b\n"
"ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
- ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
- ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
- "ldr x27, [x17, #0xe8]\n"
- "ldr x26, [x17, #0xf0]\n"
- ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
- ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
- "ldr x25, [x17, #0xf8]\n"
- "ldr x24, [x17, #0x100]\n"
- ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
- ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
- ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
- ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
- "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
- ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
- "ldr x23, [x17, #0x108]\n"
- "ldr x22, [x17, #0x110]\n"
- ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
- ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
- "ldr x20, [x17, #0x118]\n"
- "whilelt p0.h, x16, x3\n"
- ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
- ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
- "ld1b { z5.h }, p3/Z, [x21, x2]\n"
- ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
- ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
- ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
- "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
- "inch x4, ALL, MUL #8\n"
- ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
- ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
- ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
- "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
- ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
- "ld1b { z28.h }, p3/Z, [x13, x2]\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
- ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
- ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
- "ld1b { z19.h }, p4/Z, [x4]\n"
- ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
- ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
- ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
- "ld1b { z16.h }, p3/Z, [x12, x2]\n"
- ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
- ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
- ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
- ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
- ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
- ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
- ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
- "ld1b { z26.h }, p3/Z, [x11, x2]\n"
- ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
- ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
- ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
- "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
- ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
- ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
- ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
- ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
- ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
- ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
- "ld1b { z8.h }, p3/Z, [x10, x2]\n"
- ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
- ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
- ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
- "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
- ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
- ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
- ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
- ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
- ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
- ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
- ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
- "ld1b { z31.h }, p3/Z, [x9, x2]\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
- ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
- "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
- ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
- ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
- ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
- ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
- ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
- "ld1b { z0.h }, p3/Z, [x28, x2]\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
- ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
- "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
- ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
- ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
- ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
- ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
- ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
- "ld1b { z17.h }, p3/Z, [x27, x2]\n"
- ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
- ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
- ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
- "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
- ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
- ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
- "ld1b { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
- ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
- ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
- ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
- ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
- ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
- ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
- "ld1b { z3.h }, p3/Z, [x25, x2]\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
- ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
- "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
- ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
- ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
- ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
- ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
- ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
- ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44964306 // smlalb z6.s, p4/M, z24.h, z22.h\n"
+ ".inst 0x4496471e // smlalt z30.s, p4/M, z24.h, z22.h\n"
+ "ld1b { z24.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x44964011 // smlalb z17.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x449643b5 // smlalb z21.s, p4/M, z29.h, z22.h\n"
+ ".inst 0x454e1821 // usublb z1.h, z1.b, z14.b\n"
+ ".inst 0x44964167 // smlalb z7.s, p4/M, z11.h, z22.h\n"
+ ".inst 0x44964408 // smlalt z8.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n"
+ ".inst 0x449647bb // smlalt z27.s, p4/M, z29.h, z22.h\n"
+ ".inst 0x44964569 // smlalt z9.s, p4/M, z11.h, z22.h\n"
+ "ld1b { z22.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x44844006 // smlalb z6.s, p4/M, z0.h, z4.h\n"
+ ".inst 0x4484441e // smlalt z30.s, p4/M, z0.h, z4.h\n"
+ "ld1b { z0.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44844351 // smlalb z17.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844175 // smlalb z21.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x448443e7 // smlalb z7.s, p4/M, z31.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x454e1ad6 // usublb z22.h, z22.b, z14.b\n"
+ ".inst 0x4484457b // smlalt z27.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x448447e9 // smlalt z9.s, p4/M, z31.h, z4.h\n"
"ld1b { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
- ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
- "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x44904346 // smlalb z6.s, p4/M, z26.h, z16.h\n"
+ ".inst 0x4490475e // smlalt z30.s, p4/M, z26.h, z16.h\n"
+ "ld1b { z26.h }, p4/Z, [x4, #7, MUL VL]\n"
"inch x4, ALL, MUL #8\n"
- ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
- ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
- ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
- ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
- ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
- ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x2]\n"
- ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
- ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
- ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
- "ld1b { z21.h }, p4/Z, [x4]\n"
- ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
- ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
- ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
- "inch x4\n"
- ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
- ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
- ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
- ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
- "ld1b { z5.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
- ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44904391 // smlalb z17.s, p4/M, z28.h, z16.h\n"
+ ".inst 0x449043f5 // smlalb z21.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x449040a7 // smlalb z7.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x44904788 // smlalt z8.s, p4/M, z28.h, z16.h\n"
+ ".inst 0x454e1884 // usublb z4.h, z4.b, z14.b\n"
+ ".inst 0x449047fb // smlalt z27.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x449044a9 // smlalt z9.s, p4/M, z5.h, z16.h\n"
+ "ld1b { z16.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x454c1b5a // usublb z26.h, z26.b, z12.b\n"
".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
- "ld1w { z22.s }, p2/Z, [x15]\n"
- ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
- ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
- ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
- ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
- "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
- "addvl x15, x15, #2\n"
- ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
- ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
- ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
- ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
- "uzp1 z25.s, z22.s, z16.s\n"
+ ".inst 0x4499479e // smlalt z30.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z28.h }, p4/Z, [x4]\n"
+ "inch x4\n"
+ ".inst 0x44994251 // smlalb z17.s, p4/M, z18.h, z25.h\n"
+ ".inst 0x449940b5 // smlalb z21.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994067 // smlalb z7.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994648 // smlalt z8.s, p4/M, z18.h, z25.h\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ ".inst 0x449944bb // smlalt z27.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994469 // smlalt z9.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x44934246 // smlalb z6.s, p4/M, z18.h, z19.h\n"
+ ".inst 0x4493465e // smlalt z30.s, p4/M, z18.h, z19.h\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ ".inst 0x44934291 // smlalb z17.s, p4/M, z20.h, z19.h\n"
+ ".inst 0x44934075 // smlalb z21.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x44934027 // smlalb z7.s, p4/M, z1.h, z19.h\n"
+ ".inst 0x44934688 // smlalt z8.s, p4/M, z20.h, z19.h\n"
+ "ld1w { z20.s }, p1/Z, [x7, #1, MUL VL]\n"
+ ".inst 0x454e1b39 // usublb z25.h, z25.b, z14.b\n"
+ ".inst 0x4493447b // smlalt z27.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x44934429 // smlalt z9.s, p4/M, z1.h, z19.h\n"
+ "ld1b { z19.h }, p3/Z, [x20, x2]\n"
"inch x2\n"
- ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
- ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
- "uzp2 z16.s, z22.s, z16.s\n"
- "ld1w { z22.s }, p2/Z, [x14]\n"
- ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
- ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449743a6 // smlalb z6.s, p4/M, z29.h, z23.h\n"
+ ".inst 0x449747be // smlalt z30.s, p4/M, z29.h, z23.h\n"
+ "addvl x7, x7, #2\n"
+ ".inst 0x44974171 // smlalb z17.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x44974315 // smlalb z21.s, p4/M, z24.h, z23.h\n"
+ "uzp1 z29.s, z18.s, z20.s\n"
+ ".inst 0x449742c7 // smlalb z7.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44974568 // smlalt z8.s, p4/M, z11.h, z23.h\n"
+ "uzp2 z18.s, z18.s, z20.s\n"
+ "ld1w { z20.s }, p2/Z, [x8]\n"
+ ".inst 0x4497471b // smlalt z27.s, p4/M, z24.h, z23.h\n"
+ ".inst 0x449746c9 // smlalt z9.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z24.s }, p1/Z, [x8, #1, MUL VL]\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ ".inst 0x44824166 // smlalb z6.s, p4/M, z11.h, z2.h\n"
+ ".inst 0x4482457e // smlalt z30.s, p4/M, z11.h, z2.h\n"
"mov x20, x2\n"
- "incw x20\n"
- ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
- "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z29.s, z22.s, z26.s\n"
- ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
- ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
- "uzp2 z22.s, z22.s, z26.s\n"
"whilelt p2.s, x2, x3\n"
- ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
- ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x448243f1 // smlalb z17.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448242d5 // smlalb z21.s, p4/M, z22.h, z2.h\n"
+ "addvl x8, x8, #2\n"
+ ".inst 0x44824087 // smlalb z7.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
+ "uzp1 z23.s, z20.s, z24.s\n"
+ ".inst 0x448246db // smlalt z27.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x44824489 // smlalt z9.s, p4/M, z4.h, z2.h\n"
+ "uzp2 z22.s, z20.s, z24.s\n"
+ "incw x20\n"
+ ".inst 0x448043e6 // smlalb z6.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047fe // smlalt z30.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448040b1 // smlalb z17.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x44804095 // smlalb z21.s, p4/M, z4.h, z0.h\n"
+ ".inst 0x44804207 // smlalb z7.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x448044a8 // smlalt z8.s, p4/M, z5.h, z0.h\n"
"whilelt p1.s, x20, x3\n"
"whilelt p3.h, x2, x3\n"
- ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
- ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
- "addvl x14, x14, #2\n"
- ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
- ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
- ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
- ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
- ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
- ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
- ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
- ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
- ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
- "and z3.d, z14.d, z29.d\n"
- ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
- ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
- "asr z3.s, z3.s, #0x1f\n"
- ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
- ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
- "sqadd z14.s, z14.s, z3.s\n"
- ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
- ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
- ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
- ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
- ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
- ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
- "and z31.d, z23.d, z22.d\n"
- ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
- ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
- ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
- ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
- ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
- ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
- ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
- ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
- ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
- "asr z31.s, z31.s, #0x1f\n"
- "and z3.d, z6.d, z29.d\n"
- ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
- "and z0.d, z9.d, z29.d\n"
- ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
- "and z19.d, z7.d, z29.d\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
- "sqadd z23.s, z23.s, z31.s\n"
- ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "and z21.d, z18.d, z22.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z17.d, z20.d, z22.d\n"
+ ".inst 0x4480449b // smlalt z27.s, p4/M, z4.h, z0.h\n"
+ ".inst 0x44804609 // smlalt z9.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x449a40a6 // smlalb z6.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a44be // smlalt z30.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a4071 // smlalb z17.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a4215 // smlalb z21.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4327 // smlalb z7.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449a4468 // smlalt z8.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a461b // smlalt z27.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4729 // smlalt z9.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449c4066 // smlalb z6.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c447e // smlalt z30.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c4031 // smlalb z17.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c4335 // smlalb z21.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4267 // smlalb z7.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x449c4428 // smlalt z8.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c473b // smlalt z27.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4669 // smlalt z9.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x04bd74c6 // sqrdmulh z6.s, z6.s, z29.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04bd7631 // sqrdmulh z17.s, z17.s, z29.s\n"
+ ".inst 0x04bd76b5 // sqrdmulh z21.s, z21.s, z29.s\n"
+ "and z19.d, z6.d, z23.d\n"
+ ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ "and z16.d, z30.d, z22.d\n"
+ "and z2.d, z17.d, z23.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z20.d, z21.d, z23.d\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z19.s\n"
+ "and z19.d, z7.d, z23.d\n"
+ "and z0.d, z8.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z30.s, z30.s, z16.s\n"
+ "and z26.d, z27.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z16.d, z1.d, z22.d\n"
- "sqadd z6.s, z6.s, z3.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
- "sqadd z9.s, z9.s, z0.s\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "and z16.d, z9.d, z22.d\n"
+ ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
+ "sqadd z17.s, z17.s, z2.s\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z20.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ ".inst 0x448292de // srshl z30.s, p4/M, z30.s, z22.s\n"
"sqadd z7.s, z7.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "sqadd z20.s, z20.s, z17.s\n"
- ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
- ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x448292f1 // srshl z17.s, p4/M, z17.s, z23.s\n"
+ "sqadd z8.s, z8.s, z0.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x448292f5 // srshl z21.s, p4/M, z21.s, z23.s\n"
+ "sqadd z27.s, z27.s, z26.s\n"
+ ".inst 0x448292e7 // srshl z7.s, p4/M, z7.s, z23.s\n"
+ "sqadd z9.s, z9.s, z16.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x448292c8 // srshl z8.s, p4/M, z8.s, z22.s\n"
+ ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
+ ".inst 0x453047c6 // sqxtnt z6.h, z30.s\n"
+ ".inst 0x448292db // srshl z27.s, p4/M, z27.s, z22.s\n"
+ ".inst 0x448292c9 // srshl z9.s, p4/M, z9.s, z22.s\n"
".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
- ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
- ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
- ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
- ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
- "sqadd z14.h, z14.h, z15.h\n"
- "smax z14.h, p4/M, z14.h, z12.h\n"
- "smin z14.h, p4/M, z14.h, z13.h\n"
- "sqadd z6.h, z6.h, z15.h\n"
- "sqadd z9.h, z9.h, z15.h\n"
- "smax z6.h, p4/M, z6.h, z12.h\n"
- "smax z9.h, p4/M, z9.h, z12.h\n"
- "sqadd z7.h, z7.h, z15.h\n"
- "smax z7.h, p4/M, z7.h, z12.h\n"
+ ".inst 0x45304511 // sqxtnt z17.h, z8.s\n"
+ ".inst 0x45304775 // sqxtnt z21.h, z27.s\n"
+ ".inst 0x45304527 // sqxtnt z7.h, z9.s\n"
+ "sqadd z6.h, z6.h, z10.h\n"
+ "sqadd z17.h, z17.h, z10.h\n"
+ "sqadd z21.h, z21.h, z10.h\n"
+ "sqadd z7.h, z7.h, z10.h\n"
+ "smax z6.h, p4/M, z6.h, z15.h\n"
+ "smax z17.h, p4/M, z17.h, z15.h\n"
+ "smax z21.h, p4/M, z21.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z15.h\n"
"smin z6.h, p4/M, z6.h, z13.h\n"
- "st1b { z14.h }, p0, [x5, x16]\n"
- "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z17.h, p4/M, z17.h, z13.h\n"
+ "smin z21.h, p4/M, z21.h, z13.h\n"
"smin z7.h, p4/M, z7.h, z13.h\n"
- "st1b { z6.h }, p0, [x6, x16]\n"
- "st1b { z9.h }, p0, [x7, x16]\n"
- "st1b { z7.h }, p0, [x8, x16]\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
+ "st1b { z6.h }, p0, [x17, x6]\n"
+ "st1b { z17.h }, p0, [x16, x6]\n"
+ "st1b { z21.h }, p0, [x15, x6]\n"
+ "st1b { z7.h }, p0, [x14, x6]\n"
+ "inch x6\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
"ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- "uzp1 z14.s, z17.s, z16.s\n"
- "ld1b { z26.h }, p4/Z, [x4]\n"
- "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
- "uzp2 z23.s, z17.s, z16.s\n"
"addvl x21, x21, #2\n"
- "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
- "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
- "inch x16\n"
+ "ld1b { z25.h }, p4/Z, [x4]\n"
+ "ld1b { z28.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z23.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "ld1b { z31.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "uzp1 z6.s, z21.s, z16.s\n"
+ "uzp2 z30.s, z21.s, z16.s\n"
"str x21, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
- "ldp x9, x28, [x17, #0x0]\n"
- "mov z6.d, z14.d\n"
- "mov z18.d, z23.d\n"
- "ldp x27, x26, [x17, #0x10]\n"
- "ldp x25, x24, [x17, #0x20]\n"
- "mov z9.d, z14.d\n"
- "mov z20.d, z23.d\n"
- "ldp x23, x22, [x17, #0x30]\n"
- "ldp x21, x20, [x17, #0x40]\n"
- "mov z7.d, z14.d\n"
- "mov z1.d, z23.d\n"
- "ld1b { z22.h }, p3/Z, [x9, x2]\n"
- "ld1b { z2.h }, p3/Z, [x28, x2]\n"
- ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
- ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
- "ld1b { z11.h }, p3/Z, [x27, x2]\n"
- "ld1b { z3.h }, p3/Z, [x26, x2]\n"
- ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
- ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
- "ld1b { z29.h }, p3/Z, [x25, x2]\n"
- "ld1b { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
- ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
- "ld1b { z31.h }, p3/Z, [x23, x2]\n"
- "ld1b { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
- ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
- "ld1b { z19.h }, p3/Z, [x21, x2]\n"
- "ld1b { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x454c1b39 // usublb z25.h, z25.b, z12.b\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c1af7 // usublb z23.h, z23.b, z12.b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov z17.d, z6.d\n"
+ "mov z8.d, z30.d\n"
+ "mov z21.d, z6.d\n"
+ "mov z27.d, z30.d\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov z7.d, z6.d\n"
+ "mov z9.d, z30.d\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ld1b { z26.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z16.h }, p3/Z, [x28, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z5.h }, p3/Z, [x26, x2]\n"
+ "ld1b { z18.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x24, x2]\n"
+ "ld1b { z19.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z11.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454e1b5a // usublb z26.h, z26.b, z14.b\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x454e18a5 // usublb z5.h, z5.b, z14.b\n"
+ ".inst 0x454e1a52 // usublb z18.h, z18.b, z14.b\n"
+ ".inst 0x454e1863 // usublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ ".inst 0x454e196b // usublb z11.h, z11.b, z14.b\n"
+ ".inst 0x454e1a94 // usublb z20.h, z20.b, z14.b\n"
+ ".inst 0x454e1bbd // usublb z29.h, z29.b, z14.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
index a9cd8a7fa9..e782eb3197 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,288 +41,288 @@ void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl
)
{
__asm__ __volatile__(
- "mov x20, #0x9\n"
- "whilelt p0.b, XZR, x20\n"
- "ldr x23, [%x[inptrs], #0x8]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov x25, #0x9\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "ldr x23, [%x[inptrs], #0x10]\n"
+ "mov z22.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x20]\n"
"ldr x21, [%x[inptrs], #0x0]\n"
- "mov z13.b, #0x1\n"
- "lsr z13.s, z13.s, #0x8\n"
- "ld1b { z1.b }, p0/Z, [x23]\n"
- "ld1b { z2.b }, p0/Z, [x20]\n"
- "mov z8.d, z1.d\n"
- "mov z27.d, z1.d\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
"ldr x20, [%x[inptrs], #0x18]\n"
- "ld1b { z4.b }, p0/Z, [x22]\n"
- "mov z31.d, z1.d\n"
- "mov z28.d, z2.d\n"
- "ld1b { z0.b }, p0/Z, [x21]\n"
- "mov z30.d, z2.d\n"
- "mov z26.d, z2.d\n"
- "ld1b { z3.b }, p0/Z, [x20]\n"
- "mov z22.d, z4.d\n"
- "mov z10.d, z4.d\n"
+ "lsr z22.s, z22.s, #0x8\n"
+ "mov z29.s, #0x0\n"
"ptrue p2.b\n"
- "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "mov z18.d, z4.d\n"
- "ext z8.b, z8.b, z8.b, #0x2\n"
+ "whilelt p0.b, XZR, x25\n"
+ "mov z14.s, #0x0\n"
+ "mov z23.s, #0x0\n"
"lsl x10, %x[n_channels], #0x2\n"
- "neg z11.s, p2/M, z11.s\n"
- "ext z27.b, z27.b, z27.b, #0x4\n"
- "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov z11.s, #0x0\n"
+ "mov z15.s, #0x0\n"
"mov x9, #0x0\n"
- "whilelt p0.b, x9, x10\n"
- "ext z28.b, z28.b, z28.b, #0x2\n"
- "ext z30.b, z30.b, z30.b, #0x4\n"
- "ld1w { z14.s }, p0/Z, [%x[params]]\n"
"mov x28, #0x0\n"
- "ext z26.b, z26.b, z26.b, #0x6\n"
- "ext z22.b, z22.b, z22.b, #0x2\n"
+ "mov z31.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "ld1b { z2.b }, p0/Z, [x23]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z24.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z27.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "neg z16.s, p2/M, z16.s\n"
+ "mov z5.d, z1.d\n"
+ "mov z7.d, z1.d\n"
+ "whilelt p0.b, x9, x10\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
- "ext z10.b, z10.b, z10.b, #0x4\n"
- "ext z18.b, z18.b, z18.b, #0x6\n"
+ "mov z30.d, z1.d\n"
+ "mov z6.d, z2.d\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "mov z21.d, z0.d\n"
- "mov z20.d, z0.d\n"
- "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "mov z19.d, z0.d\n"
- "mov z24.d, z3.d\n"
- "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z8.d, z2.d\n"
+ "mov z19.d, z2.d\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z9.d, z4.d\n"
+ "mov z28.d, z4.d\n"
+ "ext z5.b, z5.b, z5.b, #0x2\n"
+ "ext z7.b, z7.b, z7.b, #0x4\n"
+ "ext z30.b, z30.b, z30.b, #0x6\n"
+ "ext z6.b, z6.b, z6.b, #0x2\n"
+ "ext z8.b, z8.b, z8.b, #0x4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "ext z9.b, z9.b, z9.b, #0x2\n"
+ "ext z28.b, z28.b, z28.b, #0x4\n"
+ "zip1 z1.s, z1.s, z7.s\n"
+ "mov z7.d, z4.d\n"
+ "zip1 z5.s, z5.s, z30.s\n"
+ "mov z30.d, z0.d\n"
+ "ext z7.b, z7.b, z7.b, #0x6\n"
+ "zip1 z2.s, z2.s, z8.s\n"
+ "ld1w { z8.s }, p0/Z, [%x[params]]\n"
+ "ext z30.b, z30.b, z30.b, #0x2\n"
+ "zip1 z6.s, z6.s, z19.s\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "zip1 z4.s, z4.s, z28.s\n"
+ "mov z28.d, z0.d\n"
+ "zip1 z9.s, z9.s, z7.s\n"
+ "mov z7.d, z0.d\n"
+ "ext z28.b, z28.b, z28.b, #0x4\n"
+ "zip1 z1.s, z1.s, z5.s\n"
"ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
- "mov z17.d, z3.d\n"
- "mov z16.d, z3.d\n"
+ "ext z7.b, z7.b, z7.b, #0x6\n"
+ "zip1 z2.s, z2.s, z6.s\n"
"ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z4.s, z4.s, z9.s\n"
+ "mov z9.d, z3.d\n"
+ "zip1 z0.s, z0.s, z28.s\n"
+ "mov z28.d, z3.d\n"
+ "ext z9.b, z9.b, z9.b, #0x2\n"
+ "zip1 z30.s, z30.s, z7.s\n"
"ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
- "ext z21.b, z21.b, z21.b, #0x2\n"
- "ext z20.b, z20.b, z20.b, #0x4\n"
"addvl %x[params], %x[params], #4\n"
- "ext z19.b, z19.b, z19.b, #0x6\n"
- "zip1 z1.s, z1.s, z27.s\n"
- "zip1 z8.s, z8.s, z31.s\n"
- "zip1 z2.s, z2.s, z30.s\n"
- "zip1 z28.s, z28.s, z26.s\n"
- "ext z24.b, z24.b, z24.b, #0x2\n"
- "ext z17.b, z17.b, z17.b, #0x4\n"
- "ext z16.b, z16.b, z16.b, #0x6\n"
- "zip1 z4.s, z4.s, z10.s\n"
- "zip1 z22.s, z22.s, z18.s\n"
- "zip1 z0.s, z0.s, z20.s\n"
- "zip1 z21.s, z21.s, z19.s\n"
- "zip1 z1.s, z1.s, z8.s\n"
- "zip1 z2.s, z2.s, z28.s\n"
- "zip1 z3.s, z3.s, z17.s\n"
- "zip1 z24.s, z24.s, z16.s\n"
- "zip1 z4.s, z4.s, z22.s\n"
- "zip1 z0.s, z0.s, z21.s\n"
+ "ext z28.b, z28.b, z28.b, #0x4\n"
"mov z1.q, z1.q[0]\n"
"mov z2.q, z2.q[0]\n"
- "zip1 z3.s, z3.s, z24.s\n"
"mov z4.q, z4.q[0]\n"
- "mov z24.s, #0x0\n"
- "mov z25.s, #0x0\n"
- "udot z24.s, z13.b, z1.b[0]\n"
- "mov z23.s, #0x0\n"
- "mov z22.s, #0x0\n"
- "udot z25.s, z13.b, z1.b[1]\n"
- "mov z21.s, #0x0\n"
- "mov z19.s, #0x0\n"
- "udot z23.s, z13.b, z1.b[2]\n"
- "mov z10.s, #0x0\n"
- "mov z8.s, #0x0\n"
- "udot z22.s, z13.b, z1.b[3]\n"
- "mov z20.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "udot z21.s, z13.b, z2.b[0]\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "udot z19.s, z13.b, z2.b[1]\n"
- "udot z10.s, z13.b, z2.b[2]\n"
- "udot z8.s, z13.b, z2.b[3]\n"
+ "zip1 z0.s, z0.s, z30.s\n"
+ "mov z30.d, z3.d\n"
+ "udot z25.s, z22.b, z1.b[0]\n"
+ "zip1 z3.s, z3.s, z28.s\n"
+ "udot z26.s, z22.b, z1.b[1]\n"
+ "udot z29.s, z22.b, z1.b[2]\n"
+ "ext z30.b, z30.b, z30.b, #0x6\n"
+ "udot z14.s, z22.b, z1.b[3]\n"
+ "udot z23.s, z22.b, z2.b[0]\n"
+ "udot z11.s, z22.b, z2.b[1]\n"
+ "udot z15.s, z22.b, z2.b[2]\n"
"mov z0.q, z0.q[0]\n"
- "udot z20.s, z13.b, z4.b[0]\n"
- "udot z18.s, z13.b, z4.b[1]\n"
- "mov z3.q, z3.q[0]\n"
- "udot z17.s, z13.b, z4.b[2]\n"
- "udot z16.s, z13.b, z4.b[3]\n"
- "mov z31.s, #0x0\n"
- "mov z30.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "udot z31.s, z13.b, z0.b[0]\n"
- "mov z27.s, #0x0\n"
+ "udot z31.s, z22.b, z2.b[3]\n"
+ "udot z17.s, z22.b, z4.b[0]\n"
"mov z28.s, #0x0\n"
- "udot z30.s, z13.b, z0.b[1]\n"
- "mov z29.s, #0x0\n"
- "udot z26.s, z13.b, z0.b[2]\n"
- "udot z27.s, z13.b, z0.b[3]\n"
- "udot z28.s, z13.b, z3.b[0]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "add z24.s, z24.s, z21.s\n"
- "add z25.s, z25.s, z19.s\n"
- "add z23.s, z23.s, z10.s\n"
- "add z22.s, z22.s, z8.s\n"
- "add z21.s, z20.s, z21.s\n"
+ "zip1 z9.s, z9.s, z30.s\n"
+ "udot z20.s, z22.b, z4.b[1]\n"
+ "udot z21.s, z22.b, z4.b[2]\n"
+ "udot z24.s, z22.b, z4.b[3]\n"
+ "mov z30.s, #0x0\n"
+ "udot z12.s, z22.b, z0.b[0]\n"
+ "udot z27.s, z22.b, z0.b[1]\n"
+ "udot z18.s, z22.b, z0.b[2]\n"
+ "add z25.s, z25.s, z23.s\n"
+ "zip1 z3.s, z3.s, z9.s\n"
+ "mov z9.s, #0x0\n"
+ "udot z28.s, z22.b, z0.b[3]\n"
+ "add z26.s, z26.s, z11.s\n"
+ "add z29.s, z29.s, z15.s\n"
+ "add z14.s, z14.s, z31.s\n"
+ "add z23.s, z17.s, z23.s\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z17.s, #0x0\n"
+ "add z11.s, z20.s, z11.s\n"
"mov z20.s, #0x0\n"
- "udot z20.s, z13.b, z3.b[2]\n"
- "add z19.s, z18.s, z19.s\n"
- "mov z18.s, #0x0\n"
- "udot z18.s, z13.b, z3.b[3]\n"
- "add z17.s, z17.s, z10.s\n"
- "add z16.s, z16.s, z8.s\n"
- "add z24.s, z24.s, z31.s\n"
- "add z25.s, z25.s, z30.s\n"
- "mul z24.s, p2/M, z24.s, z11.s\n"
- "mul z25.s, p2/M, z25.s, z11.s\n"
- "add z26.s, z23.s, z26.s\n"
- "add z27.s, z22.s, z27.s\n"
- "mul z26.s, p2/M, z26.s, z11.s\n"
- "mul z27.s, p2/M, z27.s, z11.s\n"
- "add z28.s, z21.s, z28.s\n"
- "add z29.s, z19.s, z29.s\n"
- "mul z28.s, p2/M, z28.s, z11.s\n"
- "mul z29.s, p2/M, z29.s, z11.s\n"
- "add z30.s, z17.s, z20.s\n"
- "add z31.s, z16.s, z18.s\n"
- "mul z30.s, p2/M, z30.s, z11.s\n"
- "mul z31.s, p2/M, z31.s, z11.s\n"
- "zip1 z19.s, z24.s, z26.s\n"
- "zip1 z18.s, z25.s, z27.s\n"
+ "udot z30.s, z22.b, z3.b[0]\n"
+ "udot z9.s, z22.b, z3.b[1]\n"
+ "udot z17.s, z22.b, z3.b[2]\n"
+ "add z15.s, z21.s, z15.s\n"
+ "udot z20.s, z22.b, z3.b[3]\n"
+ "add z31.s, z24.s, z31.s\n"
+ "add z24.s, z25.s, z12.s\n"
+ "add z25.s, z26.s, z27.s\n"
+ "add z26.s, z29.s, z18.s\n"
+ "add z27.s, z14.s, z28.s\n"
+ "add z28.s, z23.s, z30.s\n"
+ "add z29.s, z11.s, z9.s\n"
+ "add z30.s, z15.s, z17.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "mul z24.s, p2/M, z24.s, z16.s\n"
+ "mul z25.s, p2/M, z25.s, z16.s\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ "mul z27.s, p2/M, z27.s, z16.s\n"
+ "mul z28.s, p2/M, z28.s, z16.s\n"
+ "mul z29.s, p2/M, z29.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z16.s\n"
+ "mul z31.s, p2/M, z31.s, z16.s\n"
+ "zip1 z21.s, z24.s, z26.s\n"
+ "add z24.s, z24.s, z8.s\n"
+ "zip1 z23.s, z25.s, z27.s\n"
+ "add z25.s, z25.s, z8.s\n"
+ "add z26.s, z26.s, z8.s\n"
+ "add z27.s, z27.s, z8.s\n"
"zip1 z17.s, z28.s, z30.s\n"
"zip1 z16.s, z29.s, z31.s\n"
- "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z22.s, z21.s, z23.s\n"
+ "add z28.s, z28.s, z8.s\n"
+ "add z29.s, z29.s, z8.s\n"
+ "add z30.s, z30.s, z8.s\n"
"zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z14.s\n"
- "add z25.s, z25.s, z14.s\n"
- "add z26.s, z26.s, z14.s\n"
- "add z27.s, z27.s, z14.s\n"
- "add z28.s, z28.s, z14.s\n"
- "add z29.s, z29.s, z14.s\n"
- "add z30.s, z30.s, z14.s\n"
- "add z31.s, z31.s, z14.s\n"
+ "add z31.s, z31.s, z8.s\n"
"1:" // Loop
"udot z24.s, z5.b, z0.b[0]\n"
"udot z25.s, z5.b, z0.b[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z15.s }, p2/Z, [%x[params]]\n"
"ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
"udot z26.s, z5.b, z0.b[2]\n"
"udot z27.s, z5.b, z0.b[3]\n"
"incb x9\n"
"whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z28.s, z5.b, z2.b[0]\n"
+ "udot z29.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z2.b[2]\n"
+ "udot z31.s, z5.b, z2.b[3]\n"
"udot z24.s, z6.b, z1.b[0]\n"
"udot z25.s, z6.b, z1.b[1]\n"
"whilelt p0.b, x9, x10\n"
- "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
"udot z26.s, z6.b, z1.b[2]\n"
"udot z27.s, z6.b, z1.b[3]\n"
- "udot z28.s, z5.b, z2.b[0]\n"
- "udot z29.s, z5.b, z2.b[1]\n"
- "udot z30.s, z5.b, z2.b[2]\n"
- "udot z31.s, z5.b, z2.b[3]\n"
- "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
- "udot z24.s, z7.b, z2.b[0]\n"
- "udot z25.s, z7.b, z2.b[1]\n"
- ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
- "udot z26.s, z7.b, z2.b[2]\n"
- "udot z27.s, z7.b, z2.b[3]\n"
- ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
"udot z28.s, z6.b, z3.b[0]\n"
"udot z29.s, z6.b, z3.b[1]\n"
- ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
"udot z30.s, z6.b, z3.b[2]\n"
"udot z31.s, z6.b, z3.b[3]\n"
- ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
"ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
"udot z28.s, z7.b, z4.b[0]\n"
"udot z29.s, z7.b, z4.b[1]\n"
- "and z19.d, z24.d, z21.d\n"
"udot z30.s, z7.b, z4.b[2]\n"
"udot z31.s, z7.b, z4.b[3]\n"
- "and z18.d, z25.d, z21.d\n"
"ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ ".inst 0x04af7718 // sqrdmulh z24.s, z24.s, z15.s\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ ".inst 0x04af775a // sqrdmulh z26.s, z26.s, z15.s\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ ".inst 0x04af779c // sqrdmulh z28.s, z28.s, z15.s\n"
+ ".inst 0x04af77bd // sqrdmulh z29.s, z29.s, z15.s\n"
+ "and z14.d, z24.d, z21.d\n"
+ "and z12.d, z25.d, z21.d\n"
"and z17.d, z26.d, z21.d\n"
"and z16.d, z27.d, z21.d\n"
- "addvl %x[params], %x[params], #6\n"
- "asr z19.s, z19.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04af77de // sqrdmulh z30.s, z30.s, z15.s\n"
+ ".inst 0x04af77ff // sqrdmulh z31.s, z31.s, z15.s\n"
+ "asr z14.s, z14.s, #0x1f\n"
+ "asr z12.s, z12.s, #0x1f\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
- ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
- ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
- ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
- "sqadd z24.s, z24.s, z19.s\n"
- "sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z24.s, z24.s, z14.s\n"
+ "and z14.d, z28.d, z21.d\n"
+ "sqadd z25.s, z25.s, z12.s\n"
+ "and z11.d, z29.d, z21.d\n"
"sqadd z26.s, z26.s, z17.s\n"
"sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
- "and z19.d, z28.d, z21.d\n"
- "and z18.d, z29.d, z21.d\n"
"and z17.d, z30.d, z21.d\n"
"and z16.d, z31.d, z21.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ "asr z14.s, z14.s, #0x1f\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
"asr z17.s, z17.s, #0x1f\n"
"asr z16.s, z16.s, #0x1f\n"
- "sqadd z28.s, z28.s, z19.s\n"
- "sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z24.s, z24.s, z10.s\n"
+ "sqadd z28.s, z28.s, z14.s\n"
+ "sqadd z29.s, z29.s, z11.s\n"
+ "add z25.s, z25.s, z10.s\n"
"sqadd z30.s, z30.s, z17.s\n"
"sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "add z26.s, z26.s, z10.s\n"
+ "add z27.s, z27.s, z10.s\n"
+ "smin z24.s, p2/M, z24.s, z19.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "smin z25.s, p2/M, z25.s, z19.s\n"
".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "add z24.s, z24.s, z9.s\n"
- "add z25.s, z25.s, z9.s\n"
- "smin z24.s, p2/M, z24.s, z12.s\n"
- "smin z25.s, p2/M, z25.s, z12.s\n"
- "add z26.s, z26.s, z9.s\n"
- "add z27.s, z27.s, z9.s\n"
- "smin z26.s, p2/M, z26.s, z12.s\n"
- "smin z27.s, p2/M, z27.s, z12.s\n"
- "add z28.s, z28.s, z9.s\n"
- "add z29.s, z29.s, z9.s\n"
- "smin z28.s, p2/M, z28.s, z12.s\n"
- "smin z29.s, p2/M, z29.s, z12.s\n"
- "add z30.s, z30.s, z9.s\n"
- "add z31.s, z31.s, z9.s\n"
- "smin z30.s, p2/M, z30.s, z12.s\n"
- "smin z31.s, p2/M, z31.s, z12.s\n"
- "smax z24.s, p2/M, z24.s, z15.s\n"
- "smax z25.s, p2/M, z25.s, z15.s\n"
+ "add z28.s, z28.s, z10.s\n"
+ "add z29.s, z29.s, z10.s\n"
+ "smin z26.s, p2/M, z26.s, z19.s\n"
+ "smin z27.s, p2/M, z27.s, z19.s\n"
+ "smax z24.s, p2/M, z24.s, z13.s\n"
+ "add z30.s, z30.s, z10.s\n"
+ "smax z25.s, p2/M, z25.s, z13.s\n"
+ "add z31.s, z31.s, z10.s\n"
+ "smin z28.s, p2/M, z28.s, z19.s\n"
+ "smin z29.s, p2/M, z29.s, z19.s\n"
+ "smax z26.s, p2/M, z26.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z19.s\n"
+ "smax z27.s, p2/M, z27.s, z13.s\n"
"st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z15.s\n"
- "smax z27.s, p2/M, z27.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z19.s\n"
+ "smax z28.s, p2/M, z28.s, z13.s\n"
"st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z15.s\n"
- "smax z29.s, p2/M, z29.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z13.s\n"
"st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z15.s\n"
- "smax z31.s, p2/M, z31.s, z15.s\n"
+ "add z24.s, z24.s, z20.s\n"
+ "smax z30.s, p2/M, z30.s, z13.s\n"
"st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "smax z31.s, p2/M, z31.s, z13.s\n"
"st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z20.s\n"
+ "add z26.s, z26.s, z20.s\n"
"st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z20.s\n"
+ "add z27.s, z27.s, z20.s\n"
"st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
"st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z20.s\n"
- "add z28.s, z28.s, z20.s\n"
"add z29.s, z29.s, z20.s\n"
"add z30.s, z30.s, z20.s\n"
"add z31.s, z31.s, z20.s\n"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
index 4b65a67309..9149db7a0a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2023 Arm Limited.
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,353 +42,353 @@ void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl
{
__asm__ __volatile__(
"mov x20, #0x6\n"
- "whilelt p0.b, XZR, x20\n"
- "ldr x22, [%x[inptrs], #0x18]\n"
- "ldr x21, [%x[inptrs], #0x20]\n"
- "ldr x20, [%x[inptrs], #0x10]\n"
- "ld1b { z3.b }, p0/Z, [x22]\n"
- "mov z23.d, z3.d\n"
- "ext z23.b, z23.b, z23.b, #0x1\n"
- "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x27, [%x[inptrs], #0x18]\n"
+ "ldr x26, [%x[inptrs], #0x20]\n"
+ "mov z30.b, #0x1\n"
+ "ldr x25, [%x[inptrs], #0x10]\n"
"ldr x24, [%x[inptrs], #0x8]\n"
- "mov z18.d, z4.d\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z14.s, #0x0\n"
+ "mov z27.s, #0x0\n"
"ldr x23, [%x[inptrs], #0x28]\n"
- "mov z15.d, z2.d\n"
- "ext z15.b, z15.b, z15.b, #0x1\n"
"ldr x22, [%x[inptrs], #0x30]\n"
+ "mov z11.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "whilelt p0.b, XZR, x20\n"
"ldr x21, [%x[inptrs], #0x38]\n"
- "zip1 z3.d, z3.d, z23.d\n"
- "zip1 z4.d, z4.d, z18.d\n"
"ldr x20, [%x[inptrs], #0x0]\n"
+ "mov z28.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z21.s, #0x1\n"
+ "ptrue p2.b\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "mov z24.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ld1b { z3.b }, p0/Z, [x27]\n"
+ "ld1b { z4.b }, p0/Z, [x26]\n"
+ "mov z31.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x25]\n"
"ld1b { z1.b }, p0/Z, [x24]\n"
- "mov z19.d, z1.d\n"
- "ext z19.b, z19.b, z19.b, #0x1\n"
+ "mov z20.s, #0x0\n"
+ "mov z17.s, #0x0\n"
"ld1b { z5.b }, p0/Z, [x23]\n"
"ld1b { z6.b }, p0/Z, [x22]\n"
- "mov z18.d, z5.d\n"
- "mov z22.d, z6.d\n"
+ "mov z18.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z16.d, z3.d\n"
+ "mov z13.d, z4.d\n"
"ld1b { z7.b }, p0/Z, [x21]\n"
"ld1b { z0.b }, p0/Z, [x20]\n"
- "mov z8.d, z7.d\n"
- "zip1 z2.d, z2.d, z15.d\n"
- "mov z3.q, z3.q[0]\n"
- "mov z4.q, z4.q[0]\n"
- "ptrue p2.b\n"
- "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
- "ext z18.b, z18.b, z18.b, #0x1\n"
- "ext z22.b, z22.b, z22.b, #0x1\n"
- "lsl x10, %x[n_channels], #0x2\n"
- "neg z23.s, p2/M, z23.s\n"
- "ext z8.b, z8.b, z8.b, #0x1\n"
- "mov z28.b, #0x1\n"
- "mov x9, #0x0\n"
+ "mov z12.d, z2.d\n"
+ "mov z19.d, z1.d\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
"whilelt p0.b, x9, x10\n"
- "mov z25.s, #0x0\n"
- "mov z24.s, #0x0\n"
- "udot z25.s, z28.b, z3.b[0]\n"
- "ld1w { z12.s }, p0/Z, [%x[params]]\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0x0\n"
- "udot z24.s, z28.b, z3.b[2]\n"
- "mov x28, #0x0\n"
- "mov z27.d, z0.d\n"
- "udot z17.s, z28.b, z4.b[0]\n"
- "udot z16.s, z28.b, z4.b[2]\n"
+ "ext z16.b, z16.b, z16.b, #0x1\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
"ldp x27, x26, [%x[outptrs], #0x0]\n"
- "ext z27.b, z27.b, z27.b, #0x1\n"
- "zip1 z1.d, z1.d, z19.d\n"
"ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "mov z8.d, z5.d\n"
"ldp x23, x22, [%x[outptrs], #0x20]\n"
- "mov z2.q, z2.q[0]\n"
- "zip1 z5.d, z5.d, z18.d\n"
"ldp x21, x20, [%x[outptrs], #0x30]\n"
- "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
- "zip1 z6.d, z6.d, z22.d\n"
- "zip1 z7.d, z7.d, z8.d\n"
- "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
- "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
- "mov z30.s, #0x0\n"
- "mov z31.s, #0x0\n"
- "udot z30.s, z28.b, z2.b[0]\n"
+ "mov z10.d, z6.d\n"
+ "mov z9.d, z7.d\n"
+ "neg z15.s, p2/M, z15.s\n"
+ "zip1 z3.d, z3.d, z16.d\n"
+ "zip1 z4.d, z4.d, z13.d\n"
+ "ld1w { z13.s }, p0/Z, [%x[params]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "zip1 z2.d, z2.d, z12.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ld1rw { z19.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "udot z14.s, z30.b, z3.b[0]\n"
+ "udot z27.s, z30.b, z3.b[2]\n"
+ "udot z11.s, z30.b, z4.b[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "udot z22.s, z30.b, z4.b[2]\n"
+ "zip1 z5.d, z5.d, z8.d\n"
"ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
- "mov z29.s, #0x1\n"
- "udot z31.s, z28.b, z2.b[2]\n"
- "udot z25.s, z29.b, z3.b[1]\n"
+ "zip1 z6.d, z6.d, z10.d\n"
+ "mov z10.d, z0.d\n"
+ "udot z28.s, z30.b, z2.b[0]\n"
+ "zip1 z7.d, z7.d, z9.d\n"
+ "udot z25.s, z30.b, z2.b[2]\n"
+ "udot z14.s, z21.b, z3.b[1]\n"
"ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
- "zip1 z0.d, z0.d, z27.d\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
"mov z1.q, z1.q[0]\n"
- "udot z24.s, z29.b, z3.b[3]\n"
- "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "udot z27.s, z21.b, z3.b[3]\n"
"mov z5.q, z5.q[0]\n"
"mov z6.q, z6.q[0]\n"
- "udot z17.s, z29.b, z4.b[1]\n"
- "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "udot z11.s, z21.b, z4.b[1]\n"
"mov z7.q, z7.q[0]\n"
- "mov z22.s, #0x0\n"
- "udot z16.s, z29.b, z4.b[3]\n"
+ "udot z22.s, z21.b, z4.b[3]\n"
+ "udot z24.s, z30.b, z1.b[0]\n"
+ "zip1 z0.d, z0.d, z10.d\n"
+ "udot z23.s, z30.b, z1.b[2]\n"
+ "udot z31.s, z30.b, z5.b[0]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "udot z29.s, z30.b, z5.b[2]\n"
+ "udot z20.s, z30.b, z6.b[0]\n"
+ "udot z17.s, z30.b, z6.b[2]\n"
+ "udot z18.s, z30.b, z7.b[0]\n"
+ "add z14.s, z14.s, z11.s\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "udot z26.s, z30.b, z7.b[2]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z28.s, z21.b, z2.b[1]\n"
"addvl %x[params], %x[params], #5\n"
- "mov z21.s, #0x0\n"
- "mov z26.s, #0x0\n"
- "udot z22.s, z28.b, z1.b[0]\n"
+ "udot z25.s, z21.b, z2.b[3]\n"
+ "add z22.s, z27.s, z22.s\n"
+ "udot z24.s, z21.b, z1.b[1]\n"
"mov z27.s, #0x0\n"
- "mov z20.s, #0x0\n"
- "udot z21.s, z28.b, z1.b[2]\n"
- "mov z19.s, #0x0\n"
- "mov z18.s, #0x0\n"
- "udot z26.s, z28.b, z5.b[0]\n"
- "udot z27.s, z28.b, z5.b[2]\n"
- "udot z20.s, z28.b, z6.b[0]\n"
- "mov z0.q, z0.q[0]\n"
- "udot z19.s, z28.b, z6.b[2]\n"
- "udot z18.s, z28.b, z7.b[0]\n"
- "add z17.s, z25.s, z17.s\n"
- "mov z25.s, #0x0\n"
- "udot z25.s, z28.b, z7.b[2]\n"
- "udot z30.s, z29.b, z2.b[1]\n"
- "udot z31.s, z29.b, z2.b[3]\n"
- "add z16.s, z24.s, z16.s\n"
- "udot z22.s, z29.b, z1.b[1]\n"
- "mov z24.s, #0x0\n"
- "udot z24.s, z28.b, z0.b[0]\n"
- "udot z21.s, z29.b, z1.b[3]\n"
- "udot z26.s, z29.b, z5.b[1]\n"
- "udot z27.s, z29.b, z5.b[3]\n"
- "add z30.s, z30.s, z17.s\n"
- "udot z20.s, z29.b, z6.b[1]\n"
- "udot z19.s, z29.b, z6.b[3]\n"
- "add z31.s, z31.s, z16.s\n"
- "udot z18.s, z29.b, z7.b[1]\n"
- "udot z25.s, z29.b, z7.b[3]\n"
- "add z22.s, z22.s, z30.s\n"
- "udot z24.s, z29.b, z0.b[1]\n"
- "add z21.s, z21.s, z31.s\n"
- "add z20.s, z26.s, z20.s\n"
- "add z19.s, z27.s, z19.s\n"
- "add z18.s, z18.s, z17.s\n"
- "mov z17.s, #0x0\n"
- "udot z17.s, z28.b, z0.b[2]\n"
- "udot z17.s, z29.b, z0.b[3]\n"
- "add z16.s, z25.s, z16.s\n"
- "add z24.s, z22.s, z24.s\n"
- "add z25.s, z21.s, z17.s\n"
- "mul z24.s, p2/M, z24.s, z23.s\n"
- "mul z25.s, p2/M, z25.s, z23.s\n"
- "add z26.s, z26.s, z22.s\n"
- "add z27.s, z27.s, z21.s\n"
- "mul z26.s, p2/M, z26.s, z23.s\n"
- "mul z27.s, p2/M, z27.s, z23.s\n"
- "add z28.s, z20.s, z30.s\n"
- "add z29.s, z19.s, z31.s\n"
- "mul z28.s, p2/M, z28.s, z23.s\n"
- "mul z29.s, p2/M, z29.s, z23.s\n"
+ "udot z23.s, z21.b, z1.b[3]\n"
+ "udot z31.s, z21.b, z5.b[1]\n"
+ "udot z29.s, z21.b, z5.b[3]\n"
+ "udot z20.s, z21.b, z6.b[1]\n"
+ "udot z27.s, z30.b, z0.b[0]\n"
+ "udot z17.s, z21.b, z6.b[3]\n"
+ "add z28.s, z28.s, z14.s\n"
+ "udot z18.s, z21.b, z7.b[1]\n"
+ "udot z26.s, z21.b, z7.b[3]\n"
+ "add z25.s, z25.s, z22.s\n"
+ "add z24.s, z24.s, z28.s\n"
+ "add z20.s, z31.s, z20.s\n"
+ "udot z27.s, z21.b, z0.b[1]\n"
+ "add z23.s, z23.s, z25.s\n"
+ "add z17.s, z29.s, z17.s\n"
+ "add z18.s, z18.s, z14.s\n"
+ "mov z14.s, #0x0\n"
+ "add z22.s, z26.s, z22.s\n"
+ "add z26.s, z31.s, z24.s\n"
+ "udot z14.s, z30.b, z0.b[2]\n"
+ "add z24.s, z24.s, z27.s\n"
+ "add z27.s, z29.s, z23.s\n"
+ "add z28.s, z20.s, z28.s\n"
+ "add z29.s, z17.s, z25.s\n"
"add z30.s, z20.s, z18.s\n"
- "add z31.s, z19.s, z16.s\n"
- "mul z30.s, p2/M, z30.s, z23.s\n"
- "mul z31.s, p2/M, z31.s, z23.s\n"
- "zip1 z19.s, z24.s, z26.s\n"
- "zip1 z18.s, z25.s, z27.s\n"
- "zip1 z17.s, z28.s, z30.s\n"
- "zip1 z16.s, z29.s, z31.s\n"
- "zip1 z22.s, z19.s, z18.s\n"
- "zip1 z23.s, z17.s, z16.s\n"
- "add z24.s, z24.s, z12.s\n"
- "add z25.s, z25.s, z12.s\n"
- "add z26.s, z26.s, z12.s\n"
- "add z27.s, z27.s, z12.s\n"
- "add z28.s, z28.s, z12.s\n"
- "add z29.s, z29.s, z12.s\n"
- "add z30.s, z30.s, z12.s\n"
- "add z31.s, z31.s, z12.s\n"
+ "add z31.s, z17.s, z22.s\n"
+ "mul z26.s, p2/M, z26.s, z15.s\n"
+ "udot z14.s, z21.b, z0.b[3]\n"
+ "mul z24.s, p2/M, z24.s, z15.s\n"
+ "mul z27.s, p2/M, z27.s, z15.s\n"
+ "mul z28.s, p2/M, z28.s, z15.s\n"
+ "mul z29.s, p2/M, z29.s, z15.s\n"
+ "mul z30.s, p2/M, z30.s, z15.s\n"
+ "mul z31.s, p2/M, z31.s, z15.s\n"
+ "add z25.s, z23.s, z14.s\n"
+ "zip1 z21.s, z24.s, z26.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "mul z25.s, p2/M, z25.s, z15.s\n"
+ "zip1 z22.s, z28.s, z30.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "zip1 z18.s, z29.s, z31.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "zip1 z14.s, z25.s, z27.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "zip1 z23.s, z22.s, z18.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "zip1 z22.s, z21.s, z14.s\n"
"1:" // Loop
"udot z24.s, z8.b, z0.b[0]\n"
"udot z25.s, z8.b, z0.b[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[params], #7, MUL VL]\n"
"udot z26.s, z8.b, z1.b[0]\n"
"udot z27.s, z8.b, z1.b[2]\n"
"incb x9\n"
"whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z28.s, z8.b, z2.b[0]\n"
+ "udot z29.s, z8.b, z2.b[2]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z15.b }, p2/Z, [%x[params]]\n"
"udot z24.s, z9.b, z0.b[1]\n"
"udot z25.s, z9.b, z0.b[3]\n"
"whilelt p0.b, x9, x10\n"
"udot z26.s, z9.b, z1.b[1]\n"
"udot z27.s, z9.b, z1.b[3]\n"
- "udot z28.s, z8.b, z2.b[0]\n"
- "udot z29.s, z8.b, z2.b[2]\n"
- "udot z30.s, z8.b, z3.b[0]\n"
- "udot z31.s, z8.b, z3.b[2]\n"
- "ld1b { z17.b }, p2/Z, [%x[params]]\n"
- "udot z24.s, z10.b, z1.b[0]\n"
- "udot z25.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z27.s, z10.b, z2.b[2]\n"
"udot z28.s, z9.b, z2.b[1]\n"
"udot z29.s, z9.b, z2.b[3]\n"
"udot z30.s, z9.b, z3.b[1]\n"
"udot z31.s, z9.b, z3.b[3]\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
- "udot z24.s, z11.b, z1.b[1]\n"
- "udot z25.s, z11.b, z1.b[3]\n"
- "udot z26.s, z11.b, z2.b[1]\n"
- "udot z27.s, z11.b, z2.b[3]\n"
+ "ld1b { z8.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z24.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
"udot z28.s, z10.b, z3.b[0]\n"
"udot z29.s, z10.b, z3.b[2]\n"
"udot z30.s, z10.b, z4.b[0]\n"
"udot z31.s, z10.b, z4.b[2]\n"
- "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
- "udot z24.s, z17.b, z2.b[0]\n"
- "udot z25.s, z17.b, z2.b[2]\n"
- "udot z26.s, z17.b, z3.b[0]\n"
- "udot z27.s, z17.b, z3.b[2]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z24.s, z11.b, z1.b[1]\n"
+ "udot z25.s, z11.b, z1.b[3]\n"
+ "udot z26.s, z11.b, z2.b[1]\n"
+ "udot z27.s, z11.b, z2.b[3]\n"
"udot z28.s, z11.b, z3.b[1]\n"
"udot z29.s, z11.b, z3.b[3]\n"
"udot z30.s, z11.b, z4.b[1]\n"
"udot z31.s, z11.b, z4.b[3]\n"
"ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
- "udot z24.s, z16.b, z2.b[1]\n"
- "udot z25.s, z16.b, z2.b[3]\n"
- "udot z26.s, z16.b, z3.b[1]\n"
- "udot z27.s, z16.b, z3.b[3]\n"
- "udot z28.s, z17.b, z4.b[0]\n"
- "udot z29.s, z17.b, z4.b[2]\n"
- "udot z30.s, z17.b, z5.b[0]\n"
- "udot z31.s, z17.b, z5.b[2]\n"
+ "udot z24.s, z15.b, z2.b[0]\n"
+ "udot z25.s, z15.b, z2.b[2]\n"
+ "udot z26.s, z15.b, z3.b[0]\n"
+ "udot z27.s, z15.b, z3.b[2]\n"
+ "udot z28.s, z15.b, z4.b[0]\n"
+ "udot z29.s, z15.b, z4.b[2]\n"
+ "udot z30.s, z15.b, z5.b[0]\n"
+ "udot z31.s, z15.b, z5.b[2]\n"
"ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
- "udot z24.s, z19.b, z3.b[0]\n"
- "udot z25.s, z19.b, z3.b[2]\n"
- "udot z26.s, z19.b, z4.b[0]\n"
- "udot z27.s, z19.b, z4.b[2]\n"
- "udot z28.s, z16.b, z4.b[1]\n"
- "udot z29.s, z16.b, z4.b[3]\n"
- "udot z30.s, z16.b, z5.b[1]\n"
- "udot z31.s, z16.b, z5.b[3]\n"
- "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "udot z25.s, z8.b, z2.b[3]\n"
+ "udot z26.s, z8.b, z3.b[1]\n"
+ "udot z27.s, z8.b, z3.b[3]\n"
+ "udot z28.s, z8.b, z4.b[1]\n"
+ "udot z29.s, z8.b, z4.b[3]\n"
+ "udot z30.s, z8.b, z5.b[1]\n"
+ "udot z31.s, z8.b, z5.b[3]\n"
+ "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n"
"addvl %x[params], %x[params], #16\n"
+ "udot z24.s, z21.b, z3.b[0]\n"
+ "udot z25.s, z21.b, z3.b[2]\n"
+ "udot z26.s, z21.b, z4.b[0]\n"
+ "udot z27.s, z21.b, z4.b[2]\n"
+ "udot z28.s, z21.b, z5.b[0]\n"
+ "udot z29.s, z21.b, z5.b[2]\n"
+ "ld1w { z14.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z30.s, z21.b, z6.b[0]\n"
+ "udot z31.s, z21.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
"udot z24.s, z18.b, z3.b[1]\n"
"udot z25.s, z18.b, z3.b[3]\n"
- "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
"udot z26.s, z18.b, z4.b[1]\n"
"udot z27.s, z18.b, z4.b[3]\n"
- "udot z28.s, z19.b, z5.b[0]\n"
- "udot z29.s, z19.b, z5.b[2]\n"
- "udot z30.s, z19.b, z6.b[0]\n"
- "udot z31.s, z19.b, z6.b[2]\n"
- "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
- "udot z24.s, z17.b, z4.b[0]\n"
- "udot z25.s, z17.b, z4.b[2]\n"
- "udot z26.s, z17.b, z5.b[0]\n"
- "udot z27.s, z17.b, z5.b[2]\n"
"udot z28.s, z18.b, z5.b[1]\n"
"udot z29.s, z18.b, z5.b[3]\n"
"udot z30.s, z18.b, z6.b[1]\n"
"udot z31.s, z18.b, z6.b[3]\n"
"ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
- "udot z24.s, z16.b, z4.b[1]\n"
- "udot z25.s, z16.b, z4.b[3]\n"
- ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
- "udot z26.s, z16.b, z5.b[1]\n"
- "udot z27.s, z16.b, z5.b[3]\n"
- ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "udot z24.s, z17.b, z4.b[0]\n"
+ "udot z25.s, z17.b, z4.b[2]\n"
+ "udot z26.s, z17.b, z5.b[0]\n"
+ "udot z27.s, z17.b, z5.b[2]\n"
"udot z28.s, z17.b, z6.b[0]\n"
"udot z29.s, z17.b, z6.b[2]\n"
- ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
"udot z30.s, z17.b, z7.b[0]\n"
"udot z31.s, z17.b, z7.b[2]\n"
- ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
"ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
- "udot z28.s, z16.b, z6.b[1]\n"
- "udot z29.s, z16.b, z6.b[3]\n"
- "and z19.d, z24.d, z21.d\n"
- "udot z30.s, z16.b, z7.b[1]\n"
- "udot z31.s, z16.b, z7.b[3]\n"
- "and z18.d, z25.d, z21.d\n"
+ "udot z24.s, z9.b, z4.b[1]\n"
+ "udot z25.s, z9.b, z4.b[3]\n"
+ "udot z26.s, z9.b, z5.b[1]\n"
+ "udot z27.s, z9.b, z5.b[3]\n"
+ "udot z28.s, z9.b, z6.b[1]\n"
+ "udot z29.s, z9.b, z6.b[3]\n"
+ "udot z30.s, z9.b, z7.b[1]\n"
+ "udot z31.s, z9.b, z7.b[3]\n"
"ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
- "and z17.d, z26.d, z21.d\n"
- "and z16.d, z27.d, z21.d\n"
"addvl %x[params], %x[params], #-3\n"
- "asr z19.s, z19.s, #0x1f\n"
- "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b47718 // sqrdmulh z24.s, z24.s, z20.s\n"
+ ".inst 0x04b47739 // sqrdmulh z25.s, z25.s, z20.s\n"
+ ".inst 0x04b4775a // sqrdmulh z26.s, z26.s, z20.s\n"
+ ".inst 0x04b4777b // sqrdmulh z27.s, z27.s, z20.s\n"
+ ".inst 0x04b4779c // sqrdmulh z28.s, z28.s, z20.s\n"
+ ".inst 0x04b477bd // sqrdmulh z29.s, z29.s, z20.s\n"
+ "and z17.d, z24.d, z13.d\n"
+ "and z18.d, z25.d, z13.d\n"
+ "and z15.d, z26.d, z13.d\n"
+ "and z21.d, z27.d, z13.d\n"
+ ".inst 0x04b477de // sqrdmulh z30.s, z30.s, z20.s\n"
+ ".inst 0x04b477ff // sqrdmulh z31.s, z31.s, z20.s\n"
"asr z17.s, z17.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
- ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
- ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
- ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
- "sqadd z24.s, z24.s, z19.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z17.s\n"
+ "and z20.d, z28.d, z13.d\n"
"sqadd z25.s, z25.s, z18.s\n"
- ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
- ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
- "sqadd z26.s, z26.s, z17.s\n"
- "sqadd z27.s, z27.s, z16.s\n"
- ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
- ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
- "and z19.d, z28.d, z21.d\n"
- "and z18.d, z29.d, z21.d\n"
- "and z17.d, z30.d, z21.d\n"
- "and z16.d, z31.d, z21.d\n"
- "asr z19.s, z19.s, #0x1f\n"
+ "and z18.d, z29.d, z13.d\n"
+ "sqadd z26.s, z26.s, z15.s\n"
+ "sqadd z27.s, z27.s, z21.s\n"
+ "and z17.d, z30.d, z13.d\n"
+ "and z15.d, z31.d, z13.d\n"
+ ".inst 0x448289b8 // srshl z24.s, p2/M, z24.s, z13.s\n"
+ "asr z20.s, z20.s, #0x1f\n"
"asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x448289b9 // srshl z25.s, p2/M, z25.s, z13.s\n"
+ ".inst 0x448289ba // srshl z26.s, p2/M, z26.s, z13.s\n"
+ ".inst 0x448289bb // srshl z27.s, p2/M, z27.s, z13.s\n"
"asr z17.s, z17.s, #0x1f\n"
- "asr z16.s, z16.s, #0x1f\n"
- "sqadd z28.s, z28.s, z19.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "add z24.s, z24.s, z16.s\n"
+ "sqadd z28.s, z28.s, z20.s\n"
"sqadd z29.s, z29.s, z18.s\n"
- ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
- ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "add z25.s, z25.s, z16.s\n"
"sqadd z30.s, z30.s, z17.s\n"
- "sqadd z31.s, z31.s, z16.s\n"
- ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
- ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
- "add z24.s, z24.s, z13.s\n"
- "add z25.s, z25.s, z13.s\n"
- "smin z24.s, p2/M, z24.s, z15.s\n"
- "smin z25.s, p2/M, z25.s, z15.s\n"
- "add z26.s, z26.s, z13.s\n"
- "add z27.s, z27.s, z13.s\n"
- "smin z26.s, p2/M, z26.s, z15.s\n"
- "smin z27.s, p2/M, z27.s, z15.s\n"
- "add z28.s, z28.s, z13.s\n"
- "add z29.s, z29.s, z13.s\n"
- "smin z28.s, p2/M, z28.s, z15.s\n"
- "smin z29.s, p2/M, z29.s, z15.s\n"
- "add z30.s, z30.s, z13.s\n"
- "add z31.s, z31.s, z13.s\n"
- "smin z30.s, p2/M, z30.s, z15.s\n"
- "smin z31.s, p2/M, z31.s, z15.s\n"
- "smax z24.s, p2/M, z24.s, z14.s\n"
- "smax z25.s, p2/M, z25.s, z14.s\n"
+ "sqadd z31.s, z31.s, z15.s\n"
+ ".inst 0x448289bc // srshl z28.s, p2/M, z28.s, z13.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "smin z24.s, p2/M, z24.s, z19.s\n"
+ ".inst 0x448289bd // srshl z29.s, p2/M, z29.s, z13.s\n"
+ "smin z25.s, p2/M, z25.s, z19.s\n"
+ ".inst 0x448289be // srshl z30.s, p2/M, z30.s, z13.s\n"
+ ".inst 0x448289bf // srshl z31.s, p2/M, z31.s, z13.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "smin z26.s, p2/M, z26.s, z19.s\n"
+ "smin z27.s, p2/M, z27.s, z19.s\n"
+ "smax z24.s, p2/M, z24.s, z12.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "smax z25.s, p2/M, z25.s, z12.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z19.s\n"
+ "smin z29.s, p2/M, z29.s, z19.s\n"
+ "smax z26.s, p2/M, z26.s, z12.s\n"
+ "smin z30.s, p2/M, z30.s, z19.s\n"
+ "smax z27.s, p2/M, z27.s, z12.s\n"
"st1b { z24.s }, p1, [x27, x28]\n"
"mov z24.s, z22.s[0]\n"
- "smax z26.s, p2/M, z26.s, z14.s\n"
- "smax z27.s, p2/M, z27.s, z14.s\n"
+ "smin z31.s, p2/M, z31.s, z19.s\n"
+ "smax z28.s, p2/M, z28.s, z12.s\n"
"st1b { z25.s }, p1, [x26, x28]\n"
"mov z25.s, z22.s[1]\n"
- "smax z28.s, p2/M, z28.s, z14.s\n"
- "smax z29.s, p2/M, z29.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z12.s\n"
"st1b { z26.s }, p1, [x25, x28]\n"
"mov z26.s, z22.s[2]\n"
- "smax z30.s, p2/M, z30.s, z14.s\n"
- "smax z31.s, p2/M, z31.s, z14.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "smax z30.s, p2/M, z30.s, z12.s\n"
"st1b { z27.s }, p1, [x24, x28]\n"
"mov z27.s, z22.s[3]\n"
+ "add z25.s, z25.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z12.s\n"
"st1b { z28.s }, p1, [x23, x28]\n"
"mov z28.s, z23.s[0]\n"
- "add z24.s, z24.s, z20.s\n"
+ "add z26.s, z26.s, z14.s\n"
"st1b { z29.s }, p1, [x22, x28]\n"
"mov z29.s, z23.s[1]\n"
- "add z25.s, z25.s, z20.s\n"
+ "add z27.s, z27.s, z14.s\n"
"st1b { z30.s }, p1, [x21, x28]\n"
"mov z30.s, z23.s[2]\n"
- "add z26.s, z26.s, z20.s\n"
+ "add z28.s, z28.s, z14.s\n"
"st1b { z31.s }, p1, [x20, x28]\n"
"mov z31.s, z23.s[3]\n"
"incw x28\n"
- "add z27.s, z27.s, z20.s\n"
- "add z28.s, z28.s, z20.s\n"
- "add z29.s, z29.s, z20.s\n"
- "add z30.s, z30.s, z20.s\n"
- "add z31.s, z31.s, z20.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
"b.any 1b\n"
: [params] "+&r" (params)
: [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 887eccf1e9..b4b2a3a673 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[16];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -91,316 +91,316 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x16, #0x0\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x17, #0x0\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x16\n"
- "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_outptrs]]\n"
"ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
"ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
- "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z12.b }, p4/Z, [x21]\n"
- "ld1rb { z30.b }, p4/Z, [x20]\n"
- "add x21, x25, %[offsetof_Requantize32_minval]\n"
- "add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z24.h }, p4/Z, [x22]\n"
- "ld1rh { z11.h }, p4/Z, [x21]\n"
- "ld1rh { z26.h }, p4/Z, [x20]\n"
- "ldp x13, x12, [x24, #0x0]\n"
- "incw x23\n"
- "whilelt p3.h, x16, x15\n"
- "ldp x11, x10, [x24, #0x10]\n"
- "whilelt p2.s, x16, x15\n"
- "whilelt p1.s, x23, x15\n"
- "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z14.h }, p4/Z, [x14]\n"
- "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
- "add x28, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x27, #0x0\n"
- "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x12, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x17\n"
+ "add x20, x26, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x26, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x26, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x26, %[offsetof_Requantize32_minval]\n"
+ "add x20, x26, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z15.b }, p4/Z, [x23]\n"
+ "ld1rh { z26.h }, p4/Z, [x22]\n"
+ "ld1rh { z2.h }, p4/Z, [x21]\n"
+ "ld1rh { z14.h }, p4/Z, [x20]\n"
+ "incw x24\n"
+ "whilelt p3.h, x17, x15\n"
+ "ldp x9, x28, [x16, #0x0]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "whilelt p2.s, x17, x15\n"
+ "whilelt p1.s, x24, x15\n"
+ "ld1sb { z13.h }, p4/Z, [x14]\n"
+ "ld1sb { z11.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #2, MUL VL]\n"
"ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
- ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
- ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
- "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
- "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
- ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
- ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
- "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
- "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "ld1sb { z20.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z30.h }, p4/Z, [x14, #5, MUL VL]\n"
+ "ld1sb { z28.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z17.h }, p4/Z, [x14, #7, MUL VL]\n"
"inch x14, ALL, MUL #8\n"
- ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
- "ld1w { z17.s }, p2/Z, [x9]\n"
- "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
- "uzp1 z5.s, z17.s, z16.s\n"
- "uzp2 z9.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x14]\n"
- "ldp x24, x23, [x28, #0x0]\n"
- "addvl x9, x9, #2\n"
- "mov z17.d, z5.d\n"
- "ldp x22, x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x20]\n"
- "mov z25.d, z9.d\n"
- "mov z16.d, z5.d\n"
- "ld1b { z0.h }, p3/Z, [x24, x16]\n"
- "ld1b { z29.h }, p3/Z, [x23, x16]\n"
- "mov z23.d, z9.d\n"
- "mov z22.d, z5.d\n"
- "ld1b { z4.h }, p3/Z, [x22, x16]\n"
- "ld1b { z13.h }, p3/Z, [x21, x16]\n"
- "mov z27.d, z9.d\n"
- ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
- "ld1b { z20.h }, p3/Z, [x20, x16]\n"
- "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
- ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x9, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
- ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454f11ad // ssublb z13.h, z13.b, z15.b\n"
+ "ld1w { z19.s }, p2/Z, [x25]\n"
+ "ld1w { z24.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x454f116b // ssublb z11.h, z11.b, z15.b\n"
+ ".inst 0x454f1252 // ssublb z18.h, z18.b, z15.b\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ "ld1sb { z5.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ ".inst 0x454f1294 // ssublb z20.h, z20.b, z15.b\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
+ "uzp1 z3.s, z19.s, z24.s\n"
+ "uzp2 z16.s, z19.s, z24.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1231 // ssublb z17.h, z17.b, z15.b\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x17]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1b { z4.h }, p3/Z, [x22, x17]\n"
+ "mov z8.d, z3.d\n"
+ "mov z21.d, z16.d\n"
+ "ld1b { z1.h }, p3/Z, [x21, x17]\n"
+ "mov z0.d, z3.d\n"
+ "mov z29.d, z16.d\n"
+ "ld1b { z27.h }, p3/Z, [x20, x17]\n"
+ "mov z19.d, z3.d\n"
+ "mov z9.d, z16.d\n"
+ ".inst 0x454a18e7 // usublb z7.h, z7.b, z10.b\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x454a1884 // usublb z4.h, z4.b, z10.b\n"
+ ".inst 0x454a1821 // usublb z1.h, z1.b, z10.b\n"
+ ".inst 0x454a1b7b // usublb z27.h, z27.b, z10.b\n"
"1:" // Loop
- ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
- ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
- "ldr x20, [x28, #0x28]\n"
- "ldr x21, [x28, #0x38]\n"
- ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
- ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
- "ld1b { z3.h }, p3/Z, [x20, x16]\n"
- "ldr x20, [x28, #0x30]\n"
- ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
- ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
- "ld1b { z31.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
- ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
- "ldr x21, [x28, #0x40]\n"
- "ld1b { z15.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
- ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- "ldr x20, [x28, #0x48]\n"
- ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
- ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
- "ld1b { z19.h }, p3/Z, [x21, x16]\n"
- ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
- ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
- ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x16]\n"
- ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
- ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
- ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
- "ldr x21, [x28, #0x50]\n"
- "ldr x20, [x28, #0x58]\n"
- ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
- ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
- ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
- "ld1b { z4.h }, p3/Z, [x21, x16]\n"
- ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
- ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
- "ld1b { z29.h }, p3/Z, [x20, x16]\n"
- "ldr x21, [x28, #0x60]\n"
- ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
- ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
- "ldr x20, [x28, #0x68]\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
- ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- "ld1b { z0.h }, p3/Z, [x21, x16]\n"
- ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
- ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
- "ld1b { z3.h }, p3/Z, [x20, x16]\n"
- "ldr x20, [x28, #0x70]\n"
- ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
- ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- "ld1b { z13.h }, p3/Z, [x20, x16]\n"
- ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
- ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
- ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
- "ldr x20, [x28, #0x78]\n"
- ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
- ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
- ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
- "whilelt p0.h, x27, x15\n"
- ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
- ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
- "ld1w { z20.s }, p2/Z, [x26]\n"
+ ".inst 0x449440e3 // smlalb z3.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x449444f0 // smlalt z16.s, p4/M, z7.h, z20.h\n"
+ "ldr x25, [x13, #0x28]\n"
+ "ldr x24, [x13, #0x38]\n"
+ ".inst 0x448640e8 // smlalb z8.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x448b40e0 // smlalb z0.s, p4/M, z7.h, z11.h\n"
+ "ldr x23, [x13, #0x30]\n"
+ "ldr x22, [x13, #0x40]\n"
+ ".inst 0x448d40f3 // smlalb z19.s, p4/M, z7.h, z13.h\n"
+ ".inst 0x448644f5 // smlalt z21.s, p4/M, z7.h, z6.h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "ld1b { z22.h }, p3/Z, [x25, x17]\n"
+ ".inst 0x448b44fd // smlalt z29.s, p4/M, z7.h, z11.h\n"
+ ".inst 0x448d44e9 // smlalt z9.s, p4/M, z7.h, z13.h\n"
+ "ld1b { z31.h }, p3/Z, [x24, x17]\n"
+ ".inst 0x448d4303 // smlalb z3.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x448d4710 // smlalt z16.s, p4/M, z24.h, z13.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1b { z25.h }, p3/Z, [x22, x17]\n"
+ ".inst 0x44924088 // smlalb z8.s, p4/M, z4.h, z18.h\n"
+ ".inst 0x44924020 // smlalb z0.s, p4/M, z1.h, z18.h\n"
+ "ld1b { z23.h }, p3/Z, [x20, x17]\n"
+ "ldr x20, [x13, #0x58]\n"
+ ".inst 0x448b4033 // smlalb z19.s, p4/M, z1.h, z11.h\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44924495 // smlalt z21.s, p4/M, z4.h, z18.h\n"
+ "ld1b { z12.h }, p3/Z, [x21, x17]\n"
+ ".inst 0x4492443d // smlalt z29.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448b4429 // smlalt z9.s, p4/M, z1.h, z11.h\n"
+ ".inst 0x454a1bff // usublb z31.h, z31.b, z10.b\n"
+ "ldr x21, [x13, #0x60]\n"
+ ".inst 0x449e4023 // smlalb z3.s, p4/M, z1.h, z30.h\n"
+ ".inst 0x449e4430 // smlalt z16.s, p4/M, z1.h, z30.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ld1b { z4.h }, p3/Z, [x20, x17]\n"
+ ".inst 0x44944028 // smlalb z8.s, p4/M, z1.h, z20.h\n"
+ ".inst 0x449c42c0 // smlalb z0.s, p4/M, z22.h, z28.h\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ "ldr x20, [x13, #0x68]\n"
+ ".inst 0x44864373 // smlalb z19.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x44944435 // smlalt z21.s, p4/M, z1.h, z20.h\n"
+ ".inst 0x454a1af7 // usublb z23.h, z23.b, z10.b\n"
+ "ld1b { z7.h }, p3/Z, [x21, x17]\n"
+ ".inst 0x449c46dd // smlalt z29.s, p4/M, z22.h, z28.h\n"
+ ".inst 0x44864769 // smlalt z9.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x454a198c // usublb z12.h, z12.b, z10.b\n"
+ "ldr x21, [x13, #0x70]\n"
+ ".inst 0x44914363 // smlalb z3.s, p4/M, z27.h, z17.h\n"
+ ".inst 0x44914770 // smlalt z16.s, p4/M, z27.h, z17.h\n"
+ ".inst 0x454a1884 // usublb z4.h, z4.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x20, x17]\n"
+ ".inst 0x449c4368 // smlalb z8.s, p4/M, z27.h, z28.h\n"
+ ".inst 0x44944360 // smlalb z0.s, p4/M, z27.h, z20.h\n"
+ ".inst 0x454a18e7 // usublb z7.h, z7.b, z10.b\n"
+ "ldr x20, [x13, #0x78]\n"
+ ".inst 0x44854313 // smlalb z19.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x449c4775 // smlalt z21.s, p4/M, z27.h, z28.h\n"
+ "ld1b { z1.h }, p3/Z, [x21, x17]\n"
+ "whilelt p0.h, x12, x15\n"
+ ".inst 0x4494477d // smlalt z29.s, p4/M, z27.h, z20.h\n"
+ ".inst 0x44854709 // smlalt z9.s, p4/M, z24.h, z5.h\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
+ ".inst 0x448b43e3 // smlalb z3.s, p4/M, z31.h, z11.h\n"
+ ".inst 0x448b47f0 // smlalt z16.s, p4/M, z31.h, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x11, #1, MUL VL]\n"
"inch x14\n"
- ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
- ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
- "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x448d43e8 // smlalb z8.s, p4/M, z31.h, z13.h\n"
+ ".inst 0x449e42e0 // smlalb z0.s, p4/M, z23.h, z30.h\n"
+ ".inst 0x454a1821 // usublb z1.h, z1.b, z10.b\n"
"ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
- ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
- "addvl x26, x26, #2\n"
- ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
- ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
- ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
- ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x16]\n"
- ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
- ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
- ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
- "uzp1 z2.s, z20.s, z15.s\n"
- "inch x16\n"
- ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
- ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
- "uzp2 z15.s, z20.s, z15.s\n"
- "ld1w { z20.s }, p2/Z, [x25]\n"
- ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
- ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
- "mov x20, x16\n"
+ ".inst 0x449442f3 // smlalb z19.s, p4/M, z23.h, z20.h\n"
+ ".inst 0x448d47f5 // smlalt z21.s, p4/M, z31.h, z13.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x17]\n"
+ "inch x17\n"
+ ".inst 0x449e46fd // smlalt z29.s, p4/M, z23.h, z30.h\n"
+ ".inst 0x449446e9 // smlalt z9.s, p4/M, z23.h, z20.h\n"
+ "uzp1 z20.s, z24.s, z27.s\n"
+ "addvl x11, x11, #2\n"
+ ".inst 0x44924323 // smlalb z3.s, p4/M, z25.h, z18.h\n"
+ ".inst 0x44924730 // smlalt z16.s, p4/M, z25.h, z18.h\n"
+ "uzp2 z24.s, z24.s, z27.s\n"
+ "ld1w { z27.s }, p2/Z, [x10]\n"
+ ".inst 0x448b4328 // smlalb z8.s, p4/M, z25.h, z11.h\n"
+ ".inst 0x448d4180 // smlalb z0.s, p4/M, z12.h, z13.h\n"
+ ".inst 0x454a1bff // usublb z31.h, z31.b, z10.b\n"
+ "mov x20, x17\n"
+ ".inst 0x44924093 // smlalb z19.s, p4/M, z4.h, z18.h\n"
+ ".inst 0x448b4735 // smlalt z21.s, p4/M, z25.h, z11.h\n"
+ "ld1w { z25.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "whilelt p2.s, x17, x15\n"
+ ".inst 0x448d459d // smlalt z29.s, p4/M, z12.h, z13.h\n"
+ ".inst 0x44924489 // smlalt z9.s, p4/M, z4.h, z18.h\n"
+ "addvl x10, x10, #2\n"
+ ".inst 0x448542e3 // smlalb z3.s, p4/M, z23.h, z5.h\n"
+ ".inst 0x448546f0 // smlalt z16.s, p4/M, z23.h, z5.h\n"
"incw x20\n"
- ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
- ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
- "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
- "uzp1 z21.s, z20.s, z19.s\n"
- ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
- ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
- "uzp2 z1.s, z20.s, z19.s\n"
- "whilelt p2.s, x16, x15\n"
- ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
- ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449142e8 // smlalb z8.s, p4/M, z23.h, z17.h\n"
+ ".inst 0x448640e0 // smlalb z0.s, p4/M, z7.h, z6.h\n"
+ "uzp1 z11.s, z27.s, z25.s\n"
+ ".inst 0x449e42d3 // smlalb z19.s, p4/M, z22.h, z30.h\n"
+ ".inst 0x449146f5 // smlalt z21.s, p4/M, z23.h, z17.h\n"
+ "uzp2 z27.s, z27.s, z25.s\n"
+ ".inst 0x448644fd // smlalt z29.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x449e46c9 // smlalt z9.s, p4/M, z22.h, z30.h\n"
"whilelt p1.s, x20, x15\n"
- "whilelt p3.h, x16, x15\n"
- ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
- ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
- ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
- "addvl x25, x25, #2\n"
- ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
- ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
- "and z19.d, z5.d, z21.d\n"
- ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
- ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
- "asr z19.s, z19.s, #0x1f\n"
- ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
- ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
- "sqadd z5.s, z5.s, z19.s\n"
- ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
- ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
- ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
- ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
- ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
- ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
- "and z29.d, z9.d, z1.d\n"
- ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
- ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
- ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
- ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
- ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
- ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
- ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
- ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
- ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
- "asr z29.s, z29.s, #0x1f\n"
- "and z18.d, z17.d, z21.d\n"
- ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
- "and z20.d, z16.d, z21.d\n"
- ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
- "and z19.d, z22.d, z21.d\n"
- ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
- "sqadd z9.s, z9.s, z29.s\n"
- ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
- "asr z18.s, z18.s, #0x1f\n"
- "and z7.d, z25.d, z1.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z6.d, z23.d, z1.d\n"
- "asr z19.s, z19.s, #0x1f\n"
- "and z2.d, z27.d, z1.d\n"
- "sqadd z17.s, z17.s, z18.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
- "sqadd z16.s, z16.s, z20.s\n"
+ "whilelt p3.h, x17, x15\n"
+ ".inst 0x44864183 // smlalb z3.s, p4/M, z12.h, z6.h\n"
+ ".inst 0x44864590 // smlalt z16.s, p4/M, z12.h, z6.h\n"
+ ".inst 0x449e4088 // smlalb z8.s, p4/M, z4.h, z30.h\n"
+ ".inst 0x44914020 // smlalb z0.s, p4/M, z1.h, z17.h\n"
+ ".inst 0x449c4033 // smlalb z19.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449e4495 // smlalt z21.s, p4/M, z4.h, z30.h\n"
+ ".inst 0x4491443d // smlalt z29.s, p4/M, z1.h, z17.h\n"
+ ".inst 0x449c4429 // smlalt z9.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c40e3 // smlalb z3.s, p4/M, z7.h, z28.h\n"
+ ".inst 0x449c44f0 // smlalt z16.s, p4/M, z7.h, z28.h\n"
+ ".inst 0x448542c8 // smlalb z8.s, p4/M, z22.h, z5.h\n"
+ ".inst 0x448543e0 // smlalb z0.s, p4/M, z31.h, z5.h\n"
+ ".inst 0x449143f3 // smlalb z19.s, p4/M, z31.h, z17.h\n"
+ ".inst 0x448546d5 // smlalt z21.s, p4/M, z22.h, z5.h\n"
+ ".inst 0x448547fd // smlalt z29.s, p4/M, z31.h, z5.h\n"
+ ".inst 0x449147e9 // smlalt z9.s, p4/M, z31.h, z17.h\n"
+ ".inst 0x04b47463 // sqrdmulh z3.s, z3.s, z20.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ ".inst 0x04b47400 // sqrdmulh z0.s, z0.s, z20.s\n"
+ "and z4.d, z3.d, z11.d\n"
+ ".inst 0x04b47673 // sqrdmulh z19.s, z19.s, z20.s\n"
+ ".inst 0x04b876b5 // sqrdmulh z21.s, z21.s, z24.s\n"
+ "and z13.d, z16.d, z27.d\n"
+ "and z6.d, z8.d, z11.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z7.d, z0.d, z11.d\n"
+ ".inst 0x04b877bd // sqrdmulh z29.s, z29.s, z24.s\n"
+ ".inst 0x04b87529 // sqrdmulh z9.s, z9.s, z24.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
- ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
- "sqadd z22.s, z22.s, z19.s\n"
- "asr z2.s, z2.s, #0x1f\n"
- ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
- "sqadd z25.s, z25.s, z7.s\n"
- "sqadd z23.s, z23.s, z6.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- "sqadd z27.s, z27.s, z2.s\n"
- ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
- ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
- ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
- ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
- ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
- ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
- ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
- "sqadd z5.h, z5.h, z24.h\n"
- "smax z5.h, p4/M, z5.h, z11.h\n"
- "smin z5.h, p4/M, z5.h, z26.h\n"
- "sqadd z17.h, z17.h, z24.h\n"
- "sqadd z16.h, z16.h, z24.h\n"
- "smax z17.h, p4/M, z17.h, z11.h\n"
- "smax z16.h, p4/M, z16.h, z11.h\n"
- "sqadd z22.h, z22.h, z24.h\n"
- "smax z22.h, p4/M, z22.h, z11.h\n"
- "smin z17.h, p4/M, z17.h, z26.h\n"
- "st1b { z5.h }, p0, [x13, x27]\n"
- "smin z16.h, p4/M, z16.h, z26.h\n"
- "smin z22.h, p4/M, z22.h, z26.h\n"
- "st1b { z17.h }, p0, [x12, x27]\n"
- "st1b { z16.h }, p0, [x11, x27]\n"
- "st1b { z22.h }, p0, [x10, x27]\n"
- "ld1sb { z14.h }, p4/Z, [x14]\n"
- "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
- "inch x27\n"
- "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "sqadd z3.s, z3.s, z4.s\n"
+ "and z20.d, z19.d, z11.d\n"
+ "and z18.d, z21.d, z27.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z13.s\n"
+ "and z13.d, z29.d, z27.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z23.d, z9.d, z27.d\n"
+ ".inst 0x44829163 // srshl z3.s, p4/M, z3.s, z11.s\n"
+ "sqadd z8.s, z8.s, z6.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z7.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
+ ".inst 0x44829370 // srshl z16.s, p4/M, z16.s, z27.s\n"
+ "sqadd z19.s, z19.s, z20.s\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ ".inst 0x44829168 // srshl z8.s, p4/M, z8.s, z11.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ ".inst 0x45304063 // sqxtnb z3.h, z3.s\n"
+ ".inst 0x44829160 // srshl z0.s, p4/M, z0.s, z11.s\n"
+ "sqadd z29.s, z29.s, z13.s\n"
+ ".inst 0x44829173 // srshl z19.s, p4/M, z19.s, z11.s\n"
+ "sqadd z9.s, z9.s, z23.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x44829375 // srshl z21.s, p4/M, z21.s, z27.s\n"
+ ".inst 0x45304000 // sqxtnb z0.h, z0.s\n"
+ ".inst 0x45304603 // sqxtnt z3.h, z16.s\n"
+ ".inst 0x4482937d // srshl z29.s, p4/M, z29.s, z27.s\n"
+ ".inst 0x44829369 // srshl z9.s, p4/M, z9.s, z27.s\n"
+ ".inst 0x45304273 // sqxtnb z19.h, z19.s\n"
+ ".inst 0x453046a8 // sqxtnt z8.h, z21.s\n"
+ ".inst 0x453047a0 // sqxtnt z0.h, z29.s\n"
+ ".inst 0x45304533 // sqxtnt z19.h, z9.s\n"
+ "sqadd z3.h, z3.h, z26.h\n"
+ "sqadd z8.h, z8.h, z26.h\n"
+ "sqadd z0.h, z0.h, z26.h\n"
+ "sqadd z19.h, z19.h, z26.h\n"
+ "smax z3.h, p4/M, z3.h, z2.h\n"
+ "smax z8.h, p4/M, z8.h, z2.h\n"
+ "smax z0.h, p4/M, z0.h, z2.h\n"
+ "smax z19.h, p4/M, z19.h, z2.h\n"
+ "smin z3.h, p4/M, z3.h, z14.h\n"
+ "smin z8.h, p4/M, z8.h, z14.h\n"
+ "smin z0.h, p4/M, z0.h, z14.h\n"
+ "smin z19.h, p4/M, z19.h, z14.h\n"
+ "st1b { z3.h }, p0, [x9, x12]\n"
+ "st1b { z8.h }, p0, [x28, x12]\n"
+ "st1b { z0.h }, p0, [x27, x12]\n"
+ "st1b { z19.h }, p0, [x26, x12]\n"
+ "inch x12\n"
+ "ld1sb { z13.h }, p4/Z, [x14]\n"
+ "ld1sb { z11.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #2, MUL VL]\n"
"ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
- ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
- ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
- "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
- "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
- ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
- ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
- "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
- "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "ld1sb { z20.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z30.h }, p4/Z, [x14, #5, MUL VL]\n"
+ "ld1sb { z28.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z17.h }, p4/Z, [x14, #7, MUL VL]\n"
"inch x14, ALL, MUL #8\n"
- ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
- "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- "uzp1 z5.s, z17.s, z16.s\n"
- "uzp2 z9.s, z17.s, z16.s\n"
- "ld1sb { z8.h }, p4/Z, [x14]\n"
- "ldp x24, x23, [x28, #0x0]\n"
+ ".inst 0x454f11ad // ssublb z13.h, z13.b, z15.b\n"
+ "ld1w { z1.s }, p2/Z, [x21]\n"
+ "ld1w { z0.s }, p1/Z, [x21, #1, MUL VL]\n"
"addvl x21, x21, #2\n"
+ ".inst 0x454f116b // ssublb z11.h, z11.b, z15.b\n"
+ ".inst 0x454f1252 // ssublb z18.h, z18.b, z15.b\n"
+ ".inst 0x454f10c6 // ssublb z6.h, z6.b, z15.b\n"
+ "ld1sb { z5.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ ".inst 0x454f1294 // ssublb z20.h, z20.b, z15.b\n"
+ ".inst 0x454f13de // ssublb z30.h, z30.b, z15.b\n"
+ "uzp1 z3.s, z1.s, z0.s\n"
+ "uzp2 z16.s, z1.s, z0.s\n"
"str x21, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x22, x21, [x28, #0x10]\n"
- "ldr x20, [x28, #0x20]\n"
- "mov z17.d, z5.d\n"
- "mov z25.d, z9.d\n"
- "ld1b { z0.h }, p3/Z, [x24, x16]\n"
- "ld1b { z29.h }, p3/Z, [x23, x16]\n"
- "mov z16.d, z5.d\n"
- "mov z23.d, z9.d\n"
- "ld1b { z4.h }, p3/Z, [x22, x16]\n"
- "ld1b { z13.h }, p3/Z, [x21, x16]\n"
- "mov z22.d, z5.d\n"
- "mov z27.d, z9.d\n"
- "ld1b { z20.h }, p3/Z, [x20, x16]\n"
- ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
- ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
- ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
- ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
- ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
- ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
- ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
- ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
- ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ ".inst 0x454f139c // ssublb z28.h, z28.b, z15.b\n"
+ ".inst 0x454f1231 // ssublb z17.h, z17.b, z15.b\n"
+ ".inst 0x454f10a5 // ssublb z5.h, z5.b, z15.b\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x17]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x17]\n"
+ "ld1b { z4.h }, p3/Z, [x22, x17]\n"
+ "mov z8.d, z3.d\n"
+ "mov z21.d, z16.d\n"
+ "ld1b { z1.h }, p3/Z, [x21, x17]\n"
+ "mov z0.d, z3.d\n"
+ "mov z29.d, z16.d\n"
+ "ld1b { z27.h }, p3/Z, [x20, x17]\n"
+ "mov z19.d, z3.d\n"
+ "mov z9.d, z16.d\n"
+ ".inst 0x454a18e7 // usublb z7.h, z7.b, z10.b\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x454a1884 // usublb z4.h, z4.b, z10.b\n"
+ ".inst 0x454a1821 // usublb z1.h, z1.b, z10.b\n"
+ ".inst 0x454a1b7b // usublb z27.h, z27.b, z10.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 754d06d443..7d2106ad08 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[25];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -100,348 +100,348 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "mov x7, #0x0\n"
- "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "mov x8, #0x0\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
- "mov x23, x7\n"
- "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
- "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
- "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
- "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
- "ld1rb { z26.b }, p4/Z, [x21]\n"
- "ld1rb { z13.b }, p4/Z, [x20]\n"
- "add x21, x25, %[offsetof_Requantize32_minval]\n"
- "add x20, x25, %[offsetof_Requantize32_maxval]\n"
- "ld1rh { z19.h }, p4/Z, [x22]\n"
- "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x8\n"
+ "add x20, x27, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x27, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x27, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z17.b }, p4/Z, [x20]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x27, %[offsetof_Requantize32_minval]\n"
+ "add x20, x27, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z12.b }, p4/Z, [x23]\n"
+ "ld1rh { z25.h }, p4/Z, [x22]\n"
+ "ld1rh { z14.h }, p4/Z, [x21]\n"
"ld1rh { z9.h }, p4/Z, [x20]\n"
- "ldp x16, x15, [x24, #0x0]\n"
- "incw x23\n"
- "whilelt p3.h, x7, x8\n"
- "ldp x14, x13, [x24, #0x10]\n"
- "whilelt p2.s, x7, x8\n"
- "whilelt p1.s, x23, x8\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z25.h }, p4/Z, [x17]\n"
- "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
- "add x11, %x[params], %[offsetof_Params_inptrs]\n"
- "mov x10, #0x0\n"
- "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
- ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
- "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
- "ld1w { z17.s }, p2/Z, [x12]\n"
- "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
- "uzp1 z8.s, z17.s, z16.s\n"
- "uzp2 z24.s, z17.s, z16.s\n"
- "ld1sb { z2.h }, p4/Z, [x17]\n"
- "ldp x27, x26, [x11, #0x0]\n"
- "addvl x12, x12, #2\n"
- "mov z18.d, z8.d\n"
- "ldp x25, x24, [x11, #0x10]\n"
- "ldp x23, x22, [x11, #0x20]\n"
- "mov z0.d, z24.d\n"
- "mov z15.d, z8.d\n"
- "ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z21.h }, p3/Z, [x27, x7]\n"
- "mov z1.d, z24.d\n"
- "mov z5.d, z8.d\n"
- "ld1b { z22.h }, p3/Z, [x26, x7]\n"
- "ld1b { z11.h }, p3/Z, [x25, x7]\n"
- "mov z6.d, z24.d\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- "ld1b { z20.h }, p3/Z, [x24, x7]\n"
- "ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
- ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- "ld1b { z16.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
- "ld1b { z31.h }, p3/Z, [x20, x7]\n"
- "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
- ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
- ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
- ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
- ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "incw x24\n"
+ "whilelt p3.h, x8, x17\n"
+ "ldp x11, x10, [x26, #0x0]\n"
+ "ldp x9, x28, [x26, #0x10]\n"
+ "whilelt p2.s, x8, x17\n"
+ "whilelt p1.s, x24, x17\n"
+ "ld1sb { z28.h }, p4/Z, [x16]\n"
+ "ld1sb { z20.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "ld1sb { z13.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #5, MUL VL]\n"
+ "ld1sb { z26.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1w { z11.s }, p2/Z, [x25]\n"
+ "ld1w { z4.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1252 // ssublb z18.h, z18.b, z12.b\n"
+ "ld1sb { z15.h }, p4/Z, [x16]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "uzp1 z5.s, z11.s, z4.s\n"
+ "uzp2 z11.s, z11.s, z4.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
+ ".inst 0x454c12b5 // ssublb z21.h, z21.b, z12.b\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "mov z30.d, z5.d\n"
+ "mov z16.d, z11.d\n"
+ "mov z4.d, z5.d\n"
+ "mov z8.d, z11.d\n"
+ "mov z31.d, z5.d\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov z10.d, z11.d\n"
+ "ld1b { z3.h }, p3/Z, [x27, x8]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x8]\n"
+ "ld1b { z23.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z0.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x8]\n"
+ "ld1b { z22.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z27.h }, p3/Z, [x21, x8]\n"
+ "ld1b { z19.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x45511a73 // usublb z19.h, z19.b, z17.b\n"
"1:" // Loop
- ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
- "ldr x21, [x11, #0x58]\n"
- "ldr x20, [x11, #0x78]\n"
- ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
- ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
- "ld1b { z17.h }, p3/Z, [x21, x7]\n"
- "ld1b { z29.h }, p3/Z, [x20, x7]\n"
- ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
- "ldr x21, [x11, #0x60]\n"
- "ldr x20, [x11, #0x80]\n"
- ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
- ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
- ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
- ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
- ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
- "ld1b { z22.h }, p3/Z, [x21, x7]\n"
- ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
- ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
- ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
- "ldr x21, [x11, #0x68]\n"
- ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
- "ld1b { z21.h }, p3/Z, [x20, x7]\n"
- "ldr x20, [x11, #0x88]\n"
- ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
- ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
- ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
- ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
- "ldr x22, [x11, #0x40]\n"
- ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
- ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
- ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
- "ld1b { z11.h }, p3/Z, [x21, x7]\n"
- ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
- "ld1b { z20.h }, p3/Z, [x20, x7]\n"
- ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
- "ldr x21, [x11, #0x70]\n"
- ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
- "ldr x20, [x11, #0x98]\n"
- ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
- "ldr x23, [x11, #0x50]\n"
- ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
- ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- "ld1b { z17.h }, p3/Z, [x22, x7]\n"
- ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
- "ld1b { z29.h }, p3/Z, [x21, x7]\n"
- "ld1b { z28.h }, p3/Z, [x20, x7]\n"
- ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
- "ldr x22, [x11, #0x48]\n"
- ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
- ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
- "ldr x21, [x11, #0x90]\n"
- "ldr x20, [x11, #0xa8]\n"
- ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
- ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
- ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
- ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
- ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
- ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
- "ld1b { z16.h }, p3/Z, [x22, x7]\n"
- ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
- ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
- "ld1b { z25.h }, p3/Z, [x21, x7]\n"
- ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
- "ld1b { z11.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
- ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
- "ldr x21, [x11, #0xa0]\n"
- "ldr x20, [x11, #0xb0]\n"
- ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
- ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
- ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
- ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
- ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
- ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
- "ld1b { z20.h }, p3/Z, [x21, x7]\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x7]\n"
- ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
- ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
- ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
- "ldr x20, [x11, #0xb8]\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
- ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
- ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
- "ld1b { z30.h }, p3/Z, [x20, x7]\n"
- ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
- ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
- "ldr x20, [x11, #0xc0]\n"
- "ld1w { z17.s }, p2/Z, [x9]\n"
- ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
- ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
- "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
- ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
- ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
- ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
- "ld1b { z25.h }, p3/Z, [x20, x7]\n"
- "uzp1 z10.s, z17.s, z14.s\n"
- ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
- ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
- "uzp2 z14.s, z17.s, z14.s\n"
- "ld1w { z17.s }, p2/Z, [x28]\n"
- ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
- ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
- "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
- ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
- ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
- ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
- "uzp1 z4.s, z17.s, z16.s\n"
- "inch x7\n"
- ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
- ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
- ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
- "whilelt p0.h, x10, x8\n"
- ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
- ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
- "uzp2 z22.s, z17.s, z16.s\n"
- "mov x20, x7\n"
- ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
- ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
- "and z17.d, z8.d, z4.d\n"
- "inch x17\n"
- ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
- ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
- ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
- "incw x20\n"
- ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
- ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
- "asr z17.s, z17.s, #0x1f\n"
- "whilelt p2.s, x7, x8\n"
- ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
- ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
- "and z16.d, z24.d, z22.d\n"
- "whilelt p1.s, x20, x8\n"
- ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
- ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
- ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ ".inst 0x448f4065 // smlalb z5.s, p4/M, z3.h, z15.h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x78]\n"
+ ".inst 0x448f446b // smlalt z11.s, p4/M, z3.h, z15.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x80]\n"
+ ".inst 0x449a407e // smlalb z30.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448d4064 // smlalb z4.s, p4/M, z3.h, z13.h\n"
+ ".inst 0x449c407f // smlalb z31.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449a4470 // smlalt z16.s, p4/M, z3.h, z26.h\n"
+ "ldr x21, [x15, #0x68]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "ld1b { z1.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448d4468 // smlalt z8.s, p4/M, z3.h, z13.h\n"
+ ".inst 0x449c446a // smlalt z10.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c43a5 // smlalb z5.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x449c47ab // smlalt z11.s, p4/M, z29.h, z28.h\n"
+ "ld1b { z29.h }, p3/Z, [x23, x8]\n"
+ "ld1b { z3.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x4494401e // smlalb z30.s, p4/M, z0.h, z20.h\n"
+ "ldr x25, [x15, #0x40]\n"
+ "ldr x24, [x15, #0x70]\n"
+ "whilelt p0.h, x14, x17\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ ".inst 0x455118e7 // usublb z7.h, z7.b, z17.b\n"
+ ".inst 0x44944410 // smlalt z16.s, p4/M, z0.h, z20.h\n"
+ "ld1b { z0.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ "ldr x23, [x15, #0x98]\n"
+ "ldr x22, [x15, #0x50]\n"
+ ".inst 0x449442e5 // smlalb z5.s, p4/M, z23.h, z20.h\n"
+ ".inst 0x449446eb // smlalt z11.s, p4/M, z23.h, z20.h\n"
+ "ld1b { z23.h }, p3/Z, [x20, x8]\n"
+ "ldr x21, [x15, #0x48]\n"
+ ".inst 0x44924024 // smlalb z4.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448640ff // smlalb z31.s, p4/M, z7.h, z6.h\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ "ldr x20, [x15, #0x90]\n"
+ ".inst 0x44924428 // smlalt z8.s, p4/M, z1.h, z18.h\n"
+ ".inst 0x448644ea // smlalt z10.s, p4/M, z7.h, z6.h\n"
+ "ld1b { z1.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z7.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448d431e // smlalb z30.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x448d4710 // smlalt z16.s, p4/M, z24.h, z13.h\n"
+ "ld1b { z24.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x449242c5 // smlalb z5.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x449246cb // smlalt z11.s, p4/M, z22.h, z18.h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "ld1b { z22.h }, p3/Z, [x22, x8]\n"
+ ".inst 0x449c43a4 // smlalb z4.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x4494407f // smlalb z31.s, p4/M, z3.h, z20.h\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ "ldr x23, [x15, #0xa0]\n"
+ ".inst 0x449c47a8 // smlalt z8.s, p4/M, z29.h, z28.h\n"
+ ".inst 0x4494446a // smlalt z10.s, p4/M, z3.h, z20.h\n"
+ ".inst 0x455118e7 // usublb z7.h, z7.b, z17.b\n"
+ "ldr x22, [x15, #0xb0]\n"
+ ".inst 0x449c427e // smlalb z30.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x449c4670 // smlalt z16.s, p4/M, z19.h, z28.h\n"
+ "ld1b { z28.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x44864365 // smlalb z5.s, p4/M, z27.h, z6.h\n"
+ ".inst 0x4486476b // smlalt z11.s, p4/M, z27.h, z6.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x44864004 // smlalb z4.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x448242ff // smlalb z31.s, p4/M, z23.h, z2.h\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ ".inst 0x44864408 // smlalt z8.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z0.h }, p3/Z, [x24, x8]\n"
+ ".inst 0x448246ea // smlalt z10.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n"
+ ".inst 0x4486403e // smlalb z30.s, p4/M, z1.h, z6.h\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ "ld1b { z23.h }, p3/Z, [x23, x8]\n"
+ ".inst 0x44864430 // smlalt z16.s, p4/M, z1.h, z6.h\n"
+ ".inst 0x448d4265 // smlalb z5.s, p4/M, z19.h, z13.h\n"
+ ".inst 0x448d466b // smlalt z11.s, p4/M, z19.h, z13.h\n"
+ "ld1b { z6.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z1.h }, p3/Z, [x21, x8]\n"
+ ".inst 0x449440e4 // smlalb z4.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x448d431f // smlalb z31.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ "ld1w { z19.s }, p2/Z, [x13]\n"
+ ".inst 0x449444e8 // smlalt z8.s, p4/M, z7.h, z20.h\n"
+ ".inst 0x448d470a // smlalt z10.s, p4/M, z24.h, z13.h\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ "ld1w { z20.s }, p1/Z, [x13, #1, MUL VL]\n"
+ ".inst 0x4482439e // smlalb z30.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x455118c6 // usublb z6.h, z6.b, z17.b\n"
+ ".inst 0x44824790 // smlalt z16.s, p4/M, z28.h, z2.h\n"
+ "ld1b { z13.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x448242c5 // smlalb z5.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x448246cb // smlalt z11.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x45511821 // usublb z1.h, z1.b, z17.b\n"
+ "inch x8\n"
+ ".inst 0x449a4364 // smlalb z4.s, p4/M, z27.h, z26.h\n"
+ ".inst 0x4492401f // smlalb z31.s, p4/M, z0.h, z18.h\n"
+ "uzp1 z28.s, z19.s, z20.s\n"
+ "inch x16\n"
+ ".inst 0x449a4768 // smlalt z8.s, p4/M, z27.h, z26.h\n"
+ ".inst 0x4492440a // smlalt z10.s, p4/M, z0.h, z18.h\n"
+ "uzp2 z20.s, z19.s, z20.s\n"
+ "ld1w { z27.s }, p2/Z, [x12]\n"
+ ".inst 0x449242de // smlalb z30.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x449246d0 // smlalt z16.s, p4/M, z22.h, z18.h\n"
+ "ld1w { z19.s }, p1/Z, [x12, #1, MUL VL]\n"
+ ".inst 0x455119ad // usublb z13.h, z13.b, z17.b\n"
+ ".inst 0x449a43a5 // smlalb z5.s, p4/M, z29.h, z26.h\n"
+ ".inst 0x449a47ab // smlalt z11.s, p4/M, z29.h, z26.h\n"
+ "mov x21, x8\n"
+ "whilelt p2.s, x8, x17\n"
+ ".inst 0x449542e4 // smlalb z4.s, p4/M, z23.h, z21.h\n"
+ ".inst 0x449540df // smlalb z31.s, p4/M, z6.h, z21.h\n"
"ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
- ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
- "whilelt p3.h, x7, x8\n"
- "addvl x9, x9, #2\n"
- ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
- "sqadd z8.s, z8.s, z17.s\n"
- ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
- "addvl x28, x28, #2\n"
- "asr z16.s, z16.s, #0x1f\n"
- "and z21.d, z18.d, z4.d\n"
- ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
- "and z20.d, z15.d, z4.d\n"
- ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
- "and z28.d, z5.d, z4.d\n"
- ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
- "sqadd z24.s, z24.s, z16.s\n"
- ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- "and z25.d, z0.d, z22.d\n"
- "asr z20.s, z20.s, #0x1f\n"
- "and z17.d, z1.d, z22.d\n"
- "asr z28.s, z28.s, #0x1f\n"
- "and z16.d, z6.d, z22.d\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "asr z25.s, z25.s, #0x1f\n"
- ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
- "sqadd z15.s, z15.s, z20.s\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
- "sqadd z5.s, z5.s, z28.s\n"
- "asr z16.s, z16.s, #0x1f\n"
- ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
- "sqadd z0.s, z0.s, z25.s\n"
- "sqadd z1.s, z1.s, z17.s\n"
- ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
- ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
- "sqadd z6.s, z6.s, z16.s\n"
- ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
- ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
- ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
- ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ "addvl x13, x13, #2\n"
+ ".inst 0x449546e8 // smlalt z8.s, p4/M, z23.h, z21.h\n"
+ ".inst 0x449544ca // smlalt z10.s, p4/M, z6.h, z21.h\n"
+ "uzp1 z23.s, z27.s, z19.s\n"
+ "addvl x12, x12, #2\n"
+ ".inst 0x4495407e // smlalb z30.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44954470 // smlalt z16.s, p4/M, z3.h, z21.h\n"
+ "uzp2 z6.s, z27.s, z19.s\n"
+ "incw x21\n"
+ ".inst 0x449540e5 // smlalb z5.s, p4/M, z7.h, z21.h\n"
+ ".inst 0x449544eb // smlalt z11.s, p4/M, z7.h, z21.h\n"
+ ".inst 0x44824004 // smlalb z4.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x449a403f // smlalb z31.s, p4/M, z1.h, z26.h\n"
+ ".inst 0x44824408 // smlalt z8.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x449a442a // smlalt z10.s, p4/M, z1.h, z26.h\n"
+ "whilelt p1.s, x21, x17\n"
+ "whilelt p3.h, x8, x17\n"
+ ".inst 0x448f431e // smlalb z30.s, p4/M, z24.h, z15.h\n"
+ ".inst 0x448f4710 // smlalt z16.s, p4/M, z24.h, z15.h\n"
+ ".inst 0x04bc74a5 // sqrdmulh z5.s, z5.s, z28.s\n"
+ ".inst 0x04b4756b // sqrdmulh z11.s, z11.s, z20.s\n"
+ ".inst 0x448f4024 // smlalb z4.s, p4/M, z1.h, z15.h\n"
+ ".inst 0x448f41bf // smlalb z31.s, p4/M, z13.h, z15.h\n"
+ "and z24.d, z5.d, z23.d\n"
+ ".inst 0x448f4428 // smlalt z8.s, p4/M, z1.h, z15.h\n"
+ ".inst 0x448f45aa // smlalt z10.s, p4/M, z13.h, z15.h\n"
+ "and z19.d, z11.d, z6.d\n"
+ ".inst 0x04bc77de // sqrdmulh z30.s, z30.s, z28.s\n"
+ ".inst 0x04b47610 // sqrdmulh z16.s, z16.s, z20.s\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ ".inst 0x04bc7484 // sqrdmulh z4.s, z4.s, z28.s\n"
+ ".inst 0x04bc77ff // sqrdmulh z31.s, z31.s, z28.s\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z7.d, z30.d, z23.d\n"
+ "sqadd z5.s, z5.s, z24.s\n"
+ ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
+ "and z15.d, z4.d, z23.d\n"
+ "and z24.d, z31.d, z23.d\n"
+ ".inst 0x04b4754a // sqrdmulh z10.s, z10.s, z20.s\n"
+ "sqadd z11.s, z11.s, z19.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z18.d, z16.d, z6.d\n"
+ ".inst 0x448292e5 // srshl z5.s, p4/M, z5.s, z23.s\n"
+ "asr z15.s, z15.s, #0x1f\n"
+ "and z13.d, z8.d, z6.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "and z3.d, z10.d, z6.d\n"
+ ".inst 0x448290cb // srshl z11.s, p4/M, z11.s, z6.s\n"
+ "sqadd z30.s, z30.s, z7.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z4.s, z4.s, z15.s\n"
+ "asr z13.s, z13.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z24.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x448292fe // srshl z30.s, p4/M, z30.s, z23.s\n"
+ "sqadd z16.s, z16.s, z18.s\n"
".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
- ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
- ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
- ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
- ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
- "sqadd z8.h, z8.h, z19.h\n"
- "smax z8.h, p4/M, z8.h, z12.h\n"
- "smin z8.h, p4/M, z8.h, z9.h\n"
- "sqadd z18.h, z18.h, z19.h\n"
- "sqadd z15.h, z15.h, z19.h\n"
- "smax z18.h, p4/M, z18.h, z12.h\n"
- "smax z15.h, p4/M, z15.h, z12.h\n"
- "sqadd z5.h, z5.h, z19.h\n"
- "smax z5.h, p4/M, z5.h, z12.h\n"
- "smin z18.h, p4/M, z18.h, z9.h\n"
- "st1b { z8.h }, p0, [x16, x10]\n"
- "smin z15.h, p4/M, z15.h, z9.h\n"
+ ".inst 0x448292e4 // srshl z4.s, p4/M, z4.s, z23.s\n"
+ "sqadd z8.s, z8.s, z13.s\n"
+ ".inst 0x448292ff // srshl z31.s, p4/M, z31.s, z23.s\n"
+ "sqadd z10.s, z10.s, z3.s\n"
+ ".inst 0x453043de // sqxtnb z30.h, z30.s\n"
+ ".inst 0x448290d0 // srshl z16.s, p4/M, z16.s, z6.s\n"
+ ".inst 0x45304084 // sqxtnb z4.h, z4.s\n"
+ ".inst 0x45304565 // sqxtnt z5.h, z11.s\n"
+ ".inst 0x448290c8 // srshl z8.s, p4/M, z8.s, z6.s\n"
+ ".inst 0x448290ca // srshl z10.s, p4/M, z10.s, z6.s\n"
+ ".inst 0x453043ff // sqxtnb z31.h, z31.s\n"
+ ".inst 0x4530461e // sqxtnt z30.h, z16.s\n"
+ ".inst 0x45304504 // sqxtnt z4.h, z8.s\n"
+ ".inst 0x4530455f // sqxtnt z31.h, z10.s\n"
+ "sqadd z5.h, z5.h, z25.h\n"
+ "sqadd z30.h, z30.h, z25.h\n"
+ "sqadd z4.h, z4.h, z25.h\n"
+ "sqadd z31.h, z31.h, z25.h\n"
+ "smax z5.h, p4/M, z5.h, z14.h\n"
+ "smax z30.h, p4/M, z30.h, z14.h\n"
+ "smax z4.h, p4/M, z4.h, z14.h\n"
+ "smax z31.h, p4/M, z31.h, z14.h\n"
"smin z5.h, p4/M, z5.h, z9.h\n"
- "st1b { z18.h }, p0, [x15, x10]\n"
- "st1b { z15.h }, p0, [x14, x10]\n"
- "st1b { z5.h }, p0, [x13, x10]\n"
- "ld1sb { z25.h }, p4/Z, [x17]\n"
- "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
- "inch x10\n"
- "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
- "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
- ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
- ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
- "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
- "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
- ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
- ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
- "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
- "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
- "inch x17, ALL, MUL #8\n"
- ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
- "ld1w { z17.s }, p2/Z, [x20]\n"
- "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
- "uzp1 z8.s, z17.s, z16.s\n"
- "uzp2 z24.s, z17.s, z16.s\n"
- "ld1sb { z2.h }, p4/Z, [x17]\n"
- "ldp x27, x26, [x11, #0x0]\n"
+ "smin z30.h, p4/M, z30.h, z9.h\n"
+ "smin z4.h, p4/M, z4.h, z9.h\n"
+ "smin z31.h, p4/M, z31.h, z9.h\n"
+ "st1b { z5.h }, p0, [x11, x14]\n"
+ "st1b { z30.h }, p0, [x10, x14]\n"
+ "st1b { z4.h }, p0, [x9, x14]\n"
+ "st1b { z31.h }, p0, [x28, x14]\n"
+ "inch x14\n"
+ "ld1sb { z28.h }, p4/Z, [x16]\n"
+ "ld1sb { z20.h }, p4/Z, [x16, #1, MUL VL]\n"
+ "ld1sb { z13.h }, p4/Z, [x16, #2, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x16, #3, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x16, #4, MUL VL]\n"
+ "ld1sb { z2.h }, p4/Z, [x16, #5, MUL VL]\n"
+ "ld1sb { z26.h }, p4/Z, [x16, #6, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x16, #7, MUL VL]\n"
+ "inch x16, ALL, MUL #8\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "ld1w { z1.s }, p1/Z, [x20, #1, MUL VL]\n"
"addvl x20, x20, #2\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1252 // ssublb z18.h, z18.b, z12.b\n"
+ "ld1sb { z15.h }, p4/Z, [x16]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ "uzp1 z5.s, z10.s, z1.s\n"
+ "uzp2 z11.s, z10.s, z1.s\n"
"str x20, [%x[params], %[offsetof_Params_bias]]\n"
- "ldp x25, x24, [x11, #0x10]\n"
- "ldp x23, x22, [x11, #0x20]\n"
- "mov z18.d, z8.d\n"
- "mov z0.d, z24.d\n"
- "ldp x21, x20, [x11, #0x30]\n"
- "ld1b { z21.h }, p3/Z, [x27, x7]\n"
- "mov z15.d, z8.d\n"
- "mov z1.d, z24.d\n"
- "ld1b { z22.h }, p3/Z, [x26, x7]\n"
- "ld1b { z11.h }, p3/Z, [x25, x7]\n"
- "mov z5.d, z8.d\n"
- "mov z6.d, z24.d\n"
- "ld1b { z20.h }, p3/Z, [x24, x7]\n"
- "ld1b { z27.h }, p3/Z, [x23, x7]\n"
- ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
- ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
- "ld1b { z28.h }, p3/Z, [x22, x7]\n"
- "ld1b { z16.h }, p3/Z, [x21, x7]\n"
- ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
- ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
- "ld1b { z31.h }, p3/Z, [x20, x7]\n"
- ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
- ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
- ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
- ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
- ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
- ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
- ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
- ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
+ ".inst 0x454c12b5 // ssublb z21.h, z21.b, z12.b\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "mov z30.d, z5.d\n"
+ "mov z16.d, z11.d\n"
+ "mov z4.d, z5.d\n"
+ "mov z8.d, z11.d\n"
+ "mov z31.d, z5.d\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov z10.d, z11.d\n"
+ "ld1b { z3.h }, p3/Z, [x27, x8]\n"
+ "ld1b { z29.h }, p3/Z, [x26, x8]\n"
+ "ld1b { z23.h }, p3/Z, [x25, x8]\n"
+ "ld1b { z0.h }, p3/Z, [x24, x8]\n"
+ "ld1b { z24.h }, p3/Z, [x23, x8]\n"
+ "ld1b { z22.h }, p3/Z, [x22, x8]\n"
+ "ld1b { z27.h }, p3/Z, [x21, x8]\n"
+ "ld1b { z19.h }, p3/Z, [x20, x8]\n"
+ ".inst 0x45511863 // usublb z3.h, z3.b, z17.b\n"
+ ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n"
+ ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n"
+ ".inst 0x45511800 // usublb z0.h, z0.b, z17.b\n"
+ ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n"
+ ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n"
+ ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n"
+ ".inst 0x45511a73 // usublb z19.h, z19.b, z17.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index f24a258484..c7c4c86b20 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
{
struct Params
{
- long unsigned int n_channels;
+ uint64_t n_channels;
const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
@@ -55,7 +55,7 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
const uint8_t *inptrs[36];
Params(
- long unsigned int n_channels,
+ uint64_t n_channels,
const uint8_t *const *inptrs_raw,
const void *const weights,
const int32_t *const bias,
@@ -112,533 +112,533 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
__asm__ __volatile__(
"mov x2, #0x0\n"
- "mov x24, x2\n"
- "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
- "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x27, [%x[params], %[offsetof_Params_requant]]\n"
"ptrue p4.b\n"
- "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
- "incw x24\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_outptrs]]\n"
"ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
- "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
- "ld1rb { z30.b }, p4/Z, [x21]\n"
- "ld1rb { z10.b }, p4/Z, [x20]\n"
- "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
- "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x6, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "mov x24, x2\n"
+ "add x20, x27, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x27, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x27, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z14.b }, p4/Z, [x20]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "add x21, x27, %[offsetof_Requantize32_minval]\n"
+ "add x20, x27, %[offsetof_Requantize32_maxval]\n"
+ "ld1rb { z12.b }, p4/Z, [x23]\n"
+ "ld1rh { z10.h }, p4/Z, [x22]\n"
+ "incw x24\n"
"ld1rh { z15.h }, p4/Z, [x21]\n"
- "ld1rh { z12.h }, p4/Z, [x20]\n"
- "add x20, x23, %[offsetof_Requantize32_maxval]\n"
"ld1rh { z13.h }, p4/Z, [x20]\n"
- "ldp x5, x6, [x22, #0x0]\n"
"whilelt p3.h, x2, x3\n"
- "ldp x7, x8, [x22, #0x10]\n"
+ "ldp x17, x16, [x26, #0x0]\n"
+ "ldp x15, x14, [x26, #0x10]\n"
"whilelt p2.s, x2, x3\n"
"whilelt p1.s, x24, x3\n"
- "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
- "add x17, %x[params], %[offsetof_Params_inptrs]\n"
- "ld1w { z17.s }, p2/Z, [x10]\n"
- "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
- "uzp1 z14.s, z17.s, z16.s\n"
- "ld1sb { z26.h }, p4/Z, [x4]\n"
- "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
- "uzp2 z23.s, z17.s, z16.s\n"
- "addvl x10, x10, #2\n"
- "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
- "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
- "mov x16, #0x0\n"
- "mov z6.d, z14.d\n"
- "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
- "ldp x9, x28, [x17, #0x0]\n"
- "mov z18.d, z23.d\n"
- "mov z9.d, z14.d\n"
- "ldp x27, x26, [x17, #0x10]\n"
- "ldp x25, x24, [x17, #0x20]\n"
- "mov z20.d, z23.d\n"
- "mov z7.d, z14.d\n"
- "ldp x23, x22, [x17, #0x30]\n"
- "ldp x21, x20, [x17, #0x40]\n"
- "mov z1.d, z23.d\n"
- ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
- "ld1b { z22.h }, p3/Z, [x9, x2]\n"
- "ld1b { z2.h }, p3/Z, [x28, x2]\n"
- ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
- ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
- "ld1b { z11.h }, p3/Z, [x27, x2]\n"
- "ld1b { z3.h }, p3/Z, [x26, x2]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
- "ld1b { z29.h }, p3/Z, [x25, x2]\n"
- "ld1b { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
- ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
- "ld1b { z31.h }, p3/Z, [x23, x2]\n"
- "ld1b { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- "ld1b { z19.h }, p3/Z, [x21, x2]\n"
- "ld1b { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "str x10, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "ld1w { z5.s }, p2/Z, [x25]\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1sb { z25.h }, p4/Z, [x4]\n"
+ "ld1sb { z28.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z23.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "ld1sb { z31.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "uzp1 z6.s, z5.s, z16.s\n"
+ "uzp2 z30.s, z5.s, z16.s\n"
+ "str x25, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x454c1339 // ssublb z25.h, z25.b, z12.b\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c12f7 // ssublb z23.h, z23.b, z12.b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov z17.d, z6.d\n"
+ "mov z8.d, z30.d\n"
+ "mov z21.d, z6.d\n"
+ "mov z27.d, z30.d\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov z7.d, z6.d\n"
+ "mov z9.d, z30.d\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ld1b { z26.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z16.h }, p3/Z, [x28, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z5.h }, p3/Z, [x26, x2]\n"
+ "ld1b { z18.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x24, x2]\n"
+ "ld1b { z19.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z11.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454e1b5a // usublb z26.h, z26.b, z14.b\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x454e18a5 // usublb z5.h, z5.b, z14.b\n"
+ ".inst 0x454e1a52 // usublb z18.h, z18.b, z14.b\n"
+ ".inst 0x454e1863 // usublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ ".inst 0x454e196b // usublb z11.h, z11.b, z14.b\n"
+ ".inst 0x454e1a94 // usublb z20.h, z20.b, z14.b\n"
+ ".inst 0x454e1bbd // usublb z29.h, z29.b, z14.b\n"
"1:" // Loop
- ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
- ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
- "ldr x20, [x17, #0x50]\n"
- "ld1b { z27.h }, p3/Z, [x20, x2]\n"
- ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
- ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
- "ldr x20, [x17, #0x58]\n"
- ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
- ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
- ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
- "ld1b { z5.h }, p3/Z, [x20, x2]\n"
- "ldr x20, [x17, #0x60]\n"
- ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
- ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
- "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
- ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
- ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
- "ld1b { z22.h }, p3/Z, [x20, x2]\n"
- ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
- ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
- ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
- "ldr x20, [x17, #0x68]\n"
+ ".inst 0x44994346 // smlalb z6.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x4499475e // smlalt z30.s, p4/M, z26.h, z25.h\n"
+ "ldr x23, [x5, #0x50]\n"
+ "ldr x22, [x5, #0x58]\n"
+ ".inst 0x44994211 // smlalb z17.s, p4/M, z16.h, z25.h\n"
+ ".inst 0x44994315 // smlalb z21.s, p4/M, z24.h, z25.h\n"
+ "ldr x21, [x5, #0x60]\n"
+ "ld1sb { z0.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x449940a7 // smlalb z7.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994608 // smlalt z8.s, p4/M, z16.h, z25.h\n"
+ "ldr x20, [x5, #0x68]\n"
+ "ld1sb { z26.h }, p4/Z, [x4, #6, MUL VL]\n"
+ "ld1b { z2.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x4499471b // smlalt z27.s, p4/M, z24.h, z25.h\n"
+ ".inst 0x449944a9 // smlalt z9.s, p4/M, z5.h, z25.h\n"
+ "ld1b { z22.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x449c4206 // smlalb z6.s, p4/M, z16.h, z28.h\n"
+ ".inst 0x449c461e // smlalt z30.s, p4/M, z16.h, z28.h\n"
+ "ld1b { z1.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x449c4251 // smlalb z17.s, p4/M, z18.h, z28.h\n"
+ ".inst 0x449c40b5 // smlalb z21.s, p4/M, z5.h, z28.h\n"
+ "ld1b { z16.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
+ ".inst 0x449c4067 // smlalb z7.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x454e1842 // usublb z2.h, z2.b, z14.b\n"
+ ".inst 0x449c4648 // smlalt z8.s, p4/M, z18.h, z28.h\n"
+ "ldr x20, [x5, #0x70]\n"
+ ".inst 0x449c44bb // smlalt z27.s, p4/M, z5.h, z28.h\n"
+ ".inst 0x449c4469 // smlalt z9.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x454e1ad6 // usublb z22.h, z22.b, z14.b\n"
+ "ld1sb { z28.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x44844246 // smlalb z6.s, p4/M, z18.h, z4.h\n"
+ ".inst 0x4484465e // smlalt z30.s, p4/M, z18.h, z4.h\n"
+ ".inst 0x454e1821 // usublb z1.h, z1.b, z14.b\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44844271 // smlalb z17.s, p4/M, z19.h, z4.h\n"
+ ".inst 0x44844075 // smlalb z21.s, p4/M, z3.h, z4.h\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ "ld1b { z25.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44844047 // smlalb z7.s, p4/M, z2.h, z4.h\n"
+ ".inst 0x44844668 // smlalt z8.s, p4/M, z19.h, z4.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ldr x20, [x5, #0x78]\n"
+ ".inst 0x4484447b // smlalt z27.s, p4/M, z3.h, z4.h\n"
+ ".inst 0x44844449 // smlalt z9.s, p4/M, z2.h, z4.h\n"
+ "ld1sb { z18.h }, p4/Z, [x4]\n"
+ "ldr x22, [x5, #0x80]\n"
+ ".inst 0x44974266 // smlalb z6.s, p4/M, z19.h, z23.h\n"
+ ".inst 0x4497467e // smlalt z30.s, p4/M, z19.h, z23.h\n"
+ ".inst 0x454e1b39 // usublb z25.h, z25.b, z14.b\n"
+ "ld1sb { z4.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44974171 // smlalb z17.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x44974055 // smlalb z21.s, p4/M, z2.h, z23.h\n"
+ "ld1b { z19.h }, p3/Z, [x20, x2]\n"
+ "ldr x21, [x5, #0x88]\n"
+ ".inst 0x449742c7 // smlalb z7.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44974568 // smlalt z8.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x454c1252 // ssublb z18.h, z18.b, z12.b\n"
+ "ldr x20, [x5, #0x90]\n"
+ ".inst 0x4497445b // smlalt z27.s, p4/M, z2.h, z23.h\n"
+ ".inst 0x449746c9 // smlalt z9.s, p4/M, z22.h, z23.h\n"
+ "ld1b { z23.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x449f4166 // smlalb z6.s, p4/M, z11.h, z31.h\n"
+ ".inst 0x449f457e // smlalt z30.s, p4/M, z11.h, z31.h\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x449f4031 // smlalb z17.s, p4/M, z1.h, z31.h\n"
+ ".inst 0x449f42d5 // smlalb z21.s, p4/M, z22.h, z31.h\n"
+ "ldr x23, [x5, #0x98]\n"
+ "ldr x22, [x5, #0xa0]\n"
+ ".inst 0x449f4287 // smlalb z7.s, p4/M, z20.h, z31.h\n"
+ ".inst 0x449f4428 // smlalt z8.s, p4/M, z1.h, z31.h\n"
+ ".inst 0x454e1af7 // usublb z23.h, z23.b, z14.b\n"
+ "ld1b { z1.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x449f46db // smlalt z27.s, p4/M, z22.h, z31.h\n"
+ ".inst 0x449f4689 // smlalt z9.s, p4/M, z20.h, z31.h\n"
+ ".inst 0x454c116b // ssublb z11.h, z11.b, z12.b\n"
+ "ld1sb { z31.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n"
+ ".inst 0x4480471e // smlalt z30.s, p4/M, z24.h, z0.h\n"
+ "ld1b { z24.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ ".inst 0x448040b1 // smlalb z17.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x448043b5 // smlalb z21.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x454e1821 // usublb z1.h, z1.b, z14.b\n"
+ "ldr x21, [x5, #0xb0]\n"
+ ".inst 0x44804207 // smlalb z7.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x448044a8 // smlalt z8.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x13, [x5, #0xb8]\n"
+ ".inst 0x448047bb // smlalt z27.s, p4/M, z29.h, z0.h\n"
+ ".inst 0x44804609 // smlalt z9.s, p4/M, z16.h, z0.h\n"
+ "ld1b { z0.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x449a40a6 // smlalb z6.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a44be // smlalt z30.s, p4/M, z5.h, z26.h\n"
+ "ld1sb { z5.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldr x12, [x5, #0xc0]\n"
+ ".inst 0x449a4071 // smlalb z17.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a4215 // smlalb z21.s, p4/M, z16.h, z26.h\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "ldr x10, [x5, #0xd0]\n"
+ ".inst 0x449a4327 // smlalb z7.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449a4468 // smlalt z8.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x454e1800 // usublb z0.h, z0.b, z14.b\n"
+ "ldr x9, [x5, #0xd8]\n"
+ ".inst 0x449a461b // smlalt z27.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4729 // smlalt z9.s, p4/M, z25.h, z26.h\n"
+ "ld1b { z26.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n"
+ ".inst 0x449c4066 // smlalb z6.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c447e // smlalt z30.s, p4/M, z3.h, z28.h\n"
+ "ld1sb { z3.h }, p4/Z, [x4, #5, MUL VL]\n"
+ "ldr x28, [x5, #0xe0]\n"
+ ".inst 0x449c4051 // smlalb z17.s, p4/M, z2.h, z28.h\n"
+ ".inst 0x449c4335 // smlalb z21.s, p4/M, z25.h, z28.h\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "ldr x26, [x5, #0xf0]\n"
+ ".inst 0x449c4267 // smlalb z7.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x449c4448 // smlalt z8.s, p4/M, z2.h, z28.h\n"
+ ".inst 0x454e1b5a // usublb z26.h, z26.b, z14.b\n"
+ "ldr x25, [x5, #0xf8]\n"
+ ".inst 0x449c473b // smlalt z27.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4669 // smlalt z9.s, p4/M, z19.h, z28.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x44924046 // smlalb z6.s, p4/M, z2.h, z18.h\n"
+ ".inst 0x4492445e // smlalt z30.s, p4/M, z2.h, z18.h\n"
"ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
- ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
- ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
- "ld1b { z26.h }, p3/Z, [x20, x2]\n"
- ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
- ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
- "ldr x20, [x17, #0x70]\n"
- ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
- ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
- "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
- ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
- ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
- ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "ldr x24, [x5, #0x100]\n"
+ ".inst 0x449242d1 // smlalb z17.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x44924275 // smlalb z21.s, p4/M, z19.h, z18.h\n"
+ "ldr x23, [x5, #0x108]\n"
+ "ldr x22, [x5, #0x110]\n"
+ ".inst 0x449242e7 // smlalb z7.s, p4/M, z23.h, z18.h\n"
+ ".inst 0x449246c8 // smlalt z8.s, p4/M, z22.h, z18.h\n"
+ ".inst 0x454e1b9c // usublb z28.h, z28.b, z14.b\n"
+ "ldr x20, [x5, #0x118]\n"
+ ".inst 0x4492467b // smlalt z27.s, p4/M, z19.h, z18.h\n"
+ ".inst 0x449246e9 // smlalt z9.s, p4/M, z23.h, z18.h\n"
+ "ld1b { z18.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n"
+ ".inst 0x448446de // smlalt z30.s, p4/M, z22.h, z4.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #7, MUL VL]\n"
"inch x4, ALL, MUL #8\n"
- "ld1b { z8.h }, p3/Z, [x20, x2]\n"
- ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
- ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
- ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
- "ldr x20, [x17, #0x78]\n"
- ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
- ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
- "ld1sb { z24.h }, p4/Z, [x4]\n"
- ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
- ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
- ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
- "ld1b { z31.h }, p3/Z, [x20, x2]\n"
- ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
- ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
- ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
- "ldr x22, [x17, #0x80]\n"
+ ".inst 0x44844291 // smlalb z17.s, p4/M, z20.h, z4.h\n"
+ ".inst 0x448442f5 // smlalb z21.s, p4/M, z23.h, z4.h\n"
+ "whilelt p0.h, x6, x3\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44844027 // smlalb z7.s, p4/M, z1.h, z4.h\n"
+ ".inst 0x44844688 // smlalt z8.s, p4/M, z20.h, z4.h\n"
+ ".inst 0x454e1a52 // usublb z18.h, z18.b, z14.b\n"
+ "ld1b { z20.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x448446fb // smlalt z27.s, p4/M, z23.h, z4.h\n"
+ ".inst 0x44844429 // smlalt z9.s, p4/M, z1.h, z4.h\n"
+ ".inst 0x454c12d6 // ssublb z22.h, z22.b, z12.b\n"
+ "ld1sb { z4.h }, p4/Z, [x4]\n"
+ ".inst 0x448b43a6 // smlalb z6.s, p4/M, z29.h, z11.h\n"
+ ".inst 0x448b47be // smlalt z30.s, p4/M, z29.h, z11.h\n"
+ "ld1b { z29.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x448b4211 // smlalb z17.s, p4/M, z16.h, z11.h\n"
+ ".inst 0x448b4315 // smlalb z21.s, p4/M, z24.h, z11.h\n"
+ ".inst 0x454e1a94 // usublb z20.h, z20.b, z14.b\n"
+ ".inst 0x448b4007 // smlalb z7.s, p4/M, z0.h, z11.h\n"
+ ".inst 0x448b4608 // smlalt z8.s, p4/M, z16.h, z11.h\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448b471b // smlalt z27.s, p4/M, z24.h, z11.h\n"
+ ".inst 0x448b4409 // smlalt z9.s, p4/M, z0.h, z11.h\n"
+ "ld1b { z11.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x454e1bbd // usublb z29.h, z29.b, z14.b\n"
+ ".inst 0x449f4206 // smlalb z6.s, p4/M, z16.h, z31.h\n"
+ ".inst 0x449f461e // smlalt z30.s, p4/M, z16.h, z31.h\n"
"ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
- ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
- ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- "ldr x21, [x17, #0x88]\n"
- ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
- ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
- ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
- "ldr x20, [x17, #0x90]\n"
- ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
- ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
- "ld1b { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
- ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
- "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
- ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
- "ldr x23, [x17, #0x98]\n"
- "ldr x22, [x17, #0xa0]\n"
- ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
- ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
- "ld1b { z11.h }, p3/Z, [x21, x2]\n"
- ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
- ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
- ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
- "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
- ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
- ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
- ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
- "ld1b { z17.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
- ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
- ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
- "ldr x20, [x17, #0xa8]\n"
- "ldr x21, [x17, #0xb0]\n"
- ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
- ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
- "ldr x13, [x17, #0xb8]\n"
- "ldr x12, [x17, #0xc0]\n"
- ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
- ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
- "ld1b { z3.h }, p3/Z, [x23, x2]\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
- ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
- "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
- ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
- ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
- ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
- "ldr x11, [x17, #0xc8]\n"
- "ldr x10, [x17, #0xd0]\n"
- ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
- ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
- "ldr x9, [x17, #0xd8]\n"
- "ldr x28, [x17, #0xe0]\n"
- ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
- ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
- "ld1b { z4.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
- ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449f4331 // smlalb z17.s, p4/M, z25.h, z31.h\n"
+ ".inst 0x449f4015 // smlalb z21.s, p4/M, z0.h, z31.h\n"
+ ".inst 0x449f4347 // smlalb z7.s, p4/M, z26.h, z31.h\n"
+ ".inst 0x449f4728 // smlalt z8.s, p4/M, z25.h, z31.h\n"
+ ".inst 0x454e196b // usublb z11.h, z11.b, z14.b\n"
+ ".inst 0x449f441b // smlalt z27.s, p4/M, z0.h, z31.h\n"
+ ".inst 0x449f4749 // smlalt z9.s, p4/M, z26.h, z31.h\n"
+ "ld1b { z31.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x454c1210 // ssublb z16.h, z16.b, z12.b\n"
+ ".inst 0x44854326 // smlalb z6.s, p4/M, z25.h, z5.h\n"
+ ".inst 0x4485473e // smlalt z30.s, p4/M, z25.h, z5.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x44854271 // smlalb z17.s, p4/M, z19.h, z5.h\n"
+ ".inst 0x44854355 // smlalb z21.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854387 // smlalb z7.s, p4/M, z28.h, z5.h\n"
+ ".inst 0x44854668 // smlalt z8.s, p4/M, z19.h, z5.h\n"
+ ".inst 0x454e1bff // usublb z31.h, z31.b, z14.b\n"
+ ".inst 0x4485475b // smlalt z27.s, p4/M, z26.h, z5.h\n"
+ ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n"
+ "ld1b { z5.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x454c1339 // ssublb z25.h, z25.b, z12.b\n"
+ ".inst 0x44834266 // smlalb z6.s, p4/M, z19.h, z3.h\n"
+ ".inst 0x4483467e // smlalt z30.s, p4/M, z19.h, z3.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x448342f1 // smlalb z17.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x44834395 // smlalb z21.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834247 // smlalb z7.s, p4/M, z18.h, z3.h\n"
+ ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n"
+ ".inst 0x454e18a5 // usublb z5.h, z5.b, z14.b\n"
+ ".inst 0x4483479b // smlalt z27.s, p4/M, z28.h, z3.h\n"
+ ".inst 0x44834649 // smlalt z9.s, p4/M, z18.h, z3.h\n"
+ "ld1b { z3.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x448242e6 // smlalb z6.s, p4/M, z23.h, z2.h\n"
+ ".inst 0x448246fe // smlalt z30.s, p4/M, z23.h, z2.h\n"
+ "ld1sb { z23.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x44824031 // smlalb z17.s, p4/M, z1.h, z2.h\n"
+ ".inst 0x44824255 // smlalb z21.s, p4/M, z18.h, z2.h\n"
+ ".inst 0x44824287 // smlalb z7.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x44824428 // smlalt z8.s, p4/M, z1.h, z2.h\n"
+ ".inst 0x454e1863 // usublb z3.h, z3.b, z14.b\n"
+ "ld1b { z1.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x4482465b // smlalt z27.s, p4/M, z18.h, z2.h\n"
+ ".inst 0x44824689 // smlalt z9.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x454c12f7 // ssublb z23.h, z23.b, z12.b\n"
"ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
- ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
- ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
- "ldr x27, [x17, #0xe8]\n"
- "ldr x26, [x17, #0xf0]\n"
- ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
- ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
- "ldr x25, [x17, #0xf8]\n"
- "ldr x24, [x17, #0x100]\n"
- ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
- ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
- "ld1b { z27.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
- ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
- ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
- "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
- ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
- "ldr x23, [x17, #0x108]\n"
- "ldr x22, [x17, #0x110]\n"
- ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
- ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
- "ldr x20, [x17, #0x118]\n"
- "whilelt p0.h, x16, x3\n"
- ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
- ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
- "ld1b { z5.h }, p3/Z, [x21, x2]\n"
- ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
- ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
- ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
- "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
- "inch x4, ALL, MUL #8\n"
- ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
- ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
- ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
- "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
- ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
- ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
- "ld1b { z28.h }, p3/Z, [x13, x2]\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
- ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
- ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
- "ld1sb { z19.h }, p4/Z, [x4]\n"
- ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
- ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
- ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
- "ld1b { z16.h }, p3/Z, [x12, x2]\n"
- ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
- ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
- ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
- ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
- ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
- ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
- ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
- "ld1b { z26.h }, p3/Z, [x11, x2]\n"
- ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
- ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
- ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
- "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
- ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
- ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
- ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
- ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
- ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
- "ld1b { z8.h }, p3/Z, [x10, x2]\n"
- ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
- ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
- ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
- "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
- ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
- ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
- ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
- ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
- ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
- ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
- ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
- "ld1b { z31.h }, p3/Z, [x9, x2]\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
- ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
- "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
- ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
- ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
- ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
- ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
- ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
- ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
- ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
- "ld1b { z0.h }, p3/Z, [x28, x2]\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
- ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
- "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
- ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
- ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
- ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
- ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
- ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
- "ld1b { z17.h }, p3/Z, [x27, x2]\n"
- ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
- ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
- ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
- "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
- ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
- ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
- ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
- "ld1b { z29.h }, p3/Z, [x26, x2]\n"
- ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
- ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
- ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
- ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
- ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
- ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
- ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
- "ld1b { z3.h }, p3/Z, [x25, x2]\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
- ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
- "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
- ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
- ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
- ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
- ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
- ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
- ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
- ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44964306 // smlalb z6.s, p4/M, z24.h, z22.h\n"
+ ".inst 0x4496471e // smlalt z30.s, p4/M, z24.h, z22.h\n"
+ "ld1b { z24.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x44964011 // smlalb z17.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x449643b5 // smlalb z21.s, p4/M, z29.h, z22.h\n"
+ ".inst 0x454e1821 // usublb z1.h, z1.b, z14.b\n"
+ ".inst 0x44964167 // smlalb z7.s, p4/M, z11.h, z22.h\n"
+ ".inst 0x44964408 // smlalt z8.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n"
+ ".inst 0x449647bb // smlalt z27.s, p4/M, z29.h, z22.h\n"
+ ".inst 0x44964569 // smlalt z9.s, p4/M, z11.h, z22.h\n"
+ "ld1b { z22.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x44844006 // smlalb z6.s, p4/M, z0.h, z4.h\n"
+ ".inst 0x4484441e // smlalt z30.s, p4/M, z0.h, z4.h\n"
+ "ld1sb { z0.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44844351 // smlalb z17.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x44844175 // smlalb z21.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x448443e7 // smlalb z7.s, p4/M, z31.h, z4.h\n"
+ ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n"
+ ".inst 0x454e1ad6 // usublb z22.h, z22.b, z14.b\n"
+ ".inst 0x4484457b // smlalt z27.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x448447e9 // smlalt z9.s, p4/M, z31.h, z4.h\n"
"ld1b { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
- ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
- "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x44904346 // smlalb z6.s, p4/M, z26.h, z16.h\n"
+ ".inst 0x4490475e // smlalt z30.s, p4/M, z26.h, z16.h\n"
+ "ld1sb { z26.h }, p4/Z, [x4, #7, MUL VL]\n"
"inch x4, ALL, MUL #8\n"
- ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
- ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
- ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
- ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
- ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
- ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
- ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
- "ld1b { z27.h }, p3/Z, [x23, x2]\n"
- ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
- ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
- ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
- "ld1sb { z21.h }, p4/Z, [x4]\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
- ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
- "inch x4\n"
- ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
- ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
- ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
- ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
- "ld1b { z5.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
- ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44904391 // smlalb z17.s, p4/M, z28.h, z16.h\n"
+ ".inst 0x449043f5 // smlalb z21.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x449040a7 // smlalb z7.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x44904788 // smlalt z8.s, p4/M, z28.h, z16.h\n"
+ ".inst 0x454e1884 // usublb z4.h, z4.b, z14.b\n"
+ ".inst 0x449047fb // smlalt z27.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x449044a9 // smlalt z9.s, p4/M, z5.h, z16.h\n"
+ "ld1b { z16.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x454c135a // ssublb z26.h, z26.b, z12.b\n"
".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
- "ld1w { z22.s }, p2/Z, [x15]\n"
- ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
- ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
- ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
- ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
- "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
- "addvl x15, x15, #2\n"
- ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
- ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
- "ld1b { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
- ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
- ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
- "uzp1 z25.s, z22.s, z16.s\n"
+ ".inst 0x4499479e // smlalt z30.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z28.h }, p4/Z, [x4]\n"
+ "inch x4\n"
+ ".inst 0x44994251 // smlalb z17.s, p4/M, z18.h, z25.h\n"
+ ".inst 0x449940b5 // smlalb z21.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994067 // smlalb z7.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994648 // smlalt z8.s, p4/M, z18.h, z25.h\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ ".inst 0x449944bb // smlalt z27.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44994469 // smlalt z9.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x44934246 // smlalb z6.s, p4/M, z18.h, z19.h\n"
+ ".inst 0x4493465e // smlalt z30.s, p4/M, z18.h, z19.h\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ ".inst 0x44934291 // smlalb z17.s, p4/M, z20.h, z19.h\n"
+ ".inst 0x44934075 // smlalb z21.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x44934027 // smlalb z7.s, p4/M, z1.h, z19.h\n"
+ ".inst 0x44934688 // smlalt z8.s, p4/M, z20.h, z19.h\n"
+ "ld1w { z20.s }, p1/Z, [x7, #1, MUL VL]\n"
+ ".inst 0x454e1b39 // usublb z25.h, z25.b, z14.b\n"
+ ".inst 0x4493447b // smlalt z27.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x44934429 // smlalt z9.s, p4/M, z1.h, z19.h\n"
+ "ld1b { z19.h }, p3/Z, [x20, x2]\n"
"inch x2\n"
- ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
- ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
- "uzp2 z16.s, z22.s, z16.s\n"
- "ld1w { z22.s }, p2/Z, [x14]\n"
- ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
- ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449743a6 // smlalb z6.s, p4/M, z29.h, z23.h\n"
+ ".inst 0x449747be // smlalt z30.s, p4/M, z29.h, z23.h\n"
+ "addvl x7, x7, #2\n"
+ ".inst 0x44974171 // smlalb z17.s, p4/M, z11.h, z23.h\n"
+ ".inst 0x44974315 // smlalb z21.s, p4/M, z24.h, z23.h\n"
+ "uzp1 z29.s, z18.s, z20.s\n"
+ ".inst 0x449742c7 // smlalb z7.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44974568 // smlalt z8.s, p4/M, z11.h, z23.h\n"
+ "uzp2 z18.s, z18.s, z20.s\n"
+ "ld1w { z20.s }, p2/Z, [x8]\n"
+ ".inst 0x4497471b // smlalt z27.s, p4/M, z24.h, z23.h\n"
+ ".inst 0x449746c9 // smlalt z9.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z24.s }, p1/Z, [x8, #1, MUL VL]\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ ".inst 0x44824166 // smlalb z6.s, p4/M, z11.h, z2.h\n"
+ ".inst 0x4482457e // smlalt z30.s, p4/M, z11.h, z2.h\n"
"mov x20, x2\n"
- "incw x20\n"
- ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
- ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
- "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
- "uzp1 z29.s, z22.s, z26.s\n"
- ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
- ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
- "uzp2 z22.s, z22.s, z26.s\n"
"whilelt p2.s, x2, x3\n"
- ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
- ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x448243f1 // smlalb z17.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x448242d5 // smlalb z21.s, p4/M, z22.h, z2.h\n"
+ "addvl x8, x8, #2\n"
+ ".inst 0x44824087 // smlalb z7.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n"
+ "uzp1 z23.s, z20.s, z24.s\n"
+ ".inst 0x448246db // smlalt z27.s, p4/M, z22.h, z2.h\n"
+ ".inst 0x44824489 // smlalt z9.s, p4/M, z4.h, z2.h\n"
+ "uzp2 z22.s, z20.s, z24.s\n"
+ "incw x20\n"
+ ".inst 0x448043e6 // smlalb z6.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448047fe // smlalt z30.s, p4/M, z31.h, z0.h\n"
+ ".inst 0x448040b1 // smlalb z17.s, p4/M, z5.h, z0.h\n"
+ ".inst 0x44804095 // smlalb z21.s, p4/M, z4.h, z0.h\n"
+ ".inst 0x44804207 // smlalb z7.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x448044a8 // smlalt z8.s, p4/M, z5.h, z0.h\n"
"whilelt p1.s, x20, x3\n"
"whilelt p3.h, x2, x3\n"
- ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
- ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
- "addvl x14, x14, #2\n"
- ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
- ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
- ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
- ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
- ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
- ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
- ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
- ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
- ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
- ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
- "and z3.d, z14.d, z29.d\n"
- ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
- ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
- "asr z3.s, z3.s, #0x1f\n"
- ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
- ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
- "sqadd z14.s, z14.s, z3.s\n"
- ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
- ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
- ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
- ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
- ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
- ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
- "and z31.d, z23.d, z22.d\n"
- ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
- ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
- ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
- ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
- ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
- ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
- ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
- ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
- ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
- "asr z31.s, z31.s, #0x1f\n"
- "and z3.d, z6.d, z29.d\n"
- ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
- "and z0.d, z9.d, z29.d\n"
- ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
- "and z19.d, z7.d, z29.d\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
- "sqadd z23.s, z23.s, z31.s\n"
- ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
- "asr z3.s, z3.s, #0x1f\n"
- "and z21.d, z18.d, z22.d\n"
- "asr z0.s, z0.s, #0x1f\n"
- "and z17.d, z20.d, z22.d\n"
+ ".inst 0x4480449b // smlalt z27.s, p4/M, z4.h, z0.h\n"
+ ".inst 0x44804609 // smlalt z9.s, p4/M, z16.h, z0.h\n"
+ ".inst 0x449a40a6 // smlalb z6.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a44be // smlalt z30.s, p4/M, z5.h, z26.h\n"
+ ".inst 0x449a4071 // smlalb z17.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a4215 // smlalb z21.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4327 // smlalb z7.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449a4468 // smlalt z8.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x449a461b // smlalt z27.s, p4/M, z16.h, z26.h\n"
+ ".inst 0x449a4729 // smlalt z9.s, p4/M, z25.h, z26.h\n"
+ ".inst 0x449c4066 // smlalb z6.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c447e // smlalt z30.s, p4/M, z3.h, z28.h\n"
+ ".inst 0x449c4031 // smlalb z17.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c4335 // smlalb z21.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4267 // smlalb z7.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x449c4428 // smlalt z8.s, p4/M, z1.h, z28.h\n"
+ ".inst 0x449c473b // smlalt z27.s, p4/M, z25.h, z28.h\n"
+ ".inst 0x449c4669 // smlalt z9.s, p4/M, z19.h, z28.h\n"
+ ".inst 0x04bd74c6 // sqrdmulh z6.s, z6.s, z29.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04bd7631 // sqrdmulh z17.s, z17.s, z29.s\n"
+ ".inst 0x04bd76b5 // sqrdmulh z21.s, z21.s, z29.s\n"
+ "and z19.d, z6.d, z23.d\n"
+ ".inst 0x04bd74e7 // sqrdmulh z7.s, z7.s, z29.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ "and z16.d, z30.d, z22.d\n"
+ "and z2.d, z17.d, z23.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z20.d, z21.d, z23.d\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "sqadd z6.s, z6.s, z19.s\n"
+ "and z19.d, z7.d, z23.d\n"
+ "and z0.d, z8.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "sqadd z30.s, z30.s, z16.s\n"
+ "and z26.d, z27.d, z22.d\n"
"asr z19.s, z19.s, #0x1f\n"
- "and z16.d, z1.d, z22.d\n"
- "sqadd z6.s, z6.s, z3.s\n"
- "asr z21.s, z21.s, #0x1f\n"
- ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
- "sqadd z9.s, z9.s, z0.s\n"
- "asr z17.s, z17.s, #0x1f\n"
- ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "and z16.d, z9.d, z22.d\n"
+ ".inst 0x448292e6 // srshl z6.s, p4/M, z6.s, z23.s\n"
+ "sqadd z17.s, z17.s, z2.s\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z20.s\n"
+ "asr z26.s, z26.s, #0x1f\n"
+ ".inst 0x448292de // srshl z30.s, p4/M, z30.s, z22.s\n"
"sqadd z7.s, z7.s, z19.s\n"
"asr z16.s, z16.s, #0x1f\n"
- ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
- "sqadd z18.s, z18.s, z21.s\n"
- "sqadd z20.s, z20.s, z17.s\n"
- ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
- ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
- "sqadd z1.s, z1.s, z16.s\n"
- ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
- ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x448292f1 // srshl z17.s, p4/M, z17.s, z23.s\n"
+ "sqadd z8.s, z8.s, z0.s\n"
".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
- ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x448292f5 // srshl z21.s, p4/M, z21.s, z23.s\n"
+ "sqadd z27.s, z27.s, z26.s\n"
+ ".inst 0x448292e7 // srshl z7.s, p4/M, z7.s, z23.s\n"
+ "sqadd z9.s, z9.s, z16.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x448292c8 // srshl z8.s, p4/M, z8.s, z22.s\n"
+ ".inst 0x453042b5 // sqxtnb z21.h, z21.s\n"
+ ".inst 0x453047c6 // sqxtnt z6.h, z30.s\n"
+ ".inst 0x448292db // srshl z27.s, p4/M, z27.s, z22.s\n"
+ ".inst 0x448292c9 // srshl z9.s, p4/M, z9.s, z22.s\n"
".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
- ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
- ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
- ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
- ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
- "sqadd z14.h, z14.h, z15.h\n"
- "smax z14.h, p4/M, z14.h, z12.h\n"
- "smin z14.h, p4/M, z14.h, z13.h\n"
- "sqadd z6.h, z6.h, z15.h\n"
- "sqadd z9.h, z9.h, z15.h\n"
- "smax z6.h, p4/M, z6.h, z12.h\n"
- "smax z9.h, p4/M, z9.h, z12.h\n"
- "sqadd z7.h, z7.h, z15.h\n"
- "smax z7.h, p4/M, z7.h, z12.h\n"
+ ".inst 0x45304511 // sqxtnt z17.h, z8.s\n"
+ ".inst 0x45304775 // sqxtnt z21.h, z27.s\n"
+ ".inst 0x45304527 // sqxtnt z7.h, z9.s\n"
+ "sqadd z6.h, z6.h, z10.h\n"
+ "sqadd z17.h, z17.h, z10.h\n"
+ "sqadd z21.h, z21.h, z10.h\n"
+ "sqadd z7.h, z7.h, z10.h\n"
+ "smax z6.h, p4/M, z6.h, z15.h\n"
+ "smax z17.h, p4/M, z17.h, z15.h\n"
+ "smax z21.h, p4/M, z21.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z15.h\n"
"smin z6.h, p4/M, z6.h, z13.h\n"
- "st1b { z14.h }, p0, [x5, x16]\n"
- "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z17.h, p4/M, z17.h, z13.h\n"
+ "smin z21.h, p4/M, z21.h, z13.h\n"
"smin z7.h, p4/M, z7.h, z13.h\n"
- "st1b { z6.h }, p0, [x6, x16]\n"
- "st1b { z9.h }, p0, [x7, x16]\n"
- "st1b { z7.h }, p0, [x8, x16]\n"
- "ld1w { z17.s }, p2/Z, [x21]\n"
+ "st1b { z6.h }, p0, [x17, x6]\n"
+ "st1b { z17.h }, p0, [x16, x6]\n"
+ "st1b { z21.h }, p0, [x15, x6]\n"
+ "st1b { z7.h }, p0, [x14, x6]\n"
+ "inch x6\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
"ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
- "uzp1 z14.s, z17.s, z16.s\n"
- "ld1sb { z26.h }, p4/Z, [x4]\n"
- "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
- "uzp2 z23.s, z17.s, z16.s\n"
"addvl x21, x21, #2\n"
- "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
- "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
- "inch x16\n"
+ "ld1sb { z25.h }, p4/Z, [x4]\n"
+ "ld1sb { z28.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z23.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "ld1sb { z31.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "uzp1 z6.s, z21.s, z16.s\n"
+ "uzp2 z30.s, z21.s, z16.s\n"
"str x21, [%x[params], %[offsetof_Params_bias]]\n"
- "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
- "ldp x9, x28, [x17, #0x0]\n"
- "mov z6.d, z14.d\n"
- "mov z18.d, z23.d\n"
- "ldp x27, x26, [x17, #0x10]\n"
- "ldp x25, x24, [x17, #0x20]\n"
- "mov z9.d, z14.d\n"
- "mov z20.d, z23.d\n"
- "ldp x23, x22, [x17, #0x30]\n"
- "ldp x21, x20, [x17, #0x40]\n"
- "mov z7.d, z14.d\n"
- "mov z1.d, z23.d\n"
- "ld1b { z22.h }, p3/Z, [x9, x2]\n"
- "ld1b { z2.h }, p3/Z, [x28, x2]\n"
- ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
- ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
- "ld1b { z11.h }, p3/Z, [x27, x2]\n"
- "ld1b { z3.h }, p3/Z, [x26, x2]\n"
- ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
- ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
- "ld1b { z29.h }, p3/Z, [x25, x2]\n"
- "ld1b { z4.h }, p3/Z, [x24, x2]\n"
- ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
- ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
- "ld1b { z31.h }, p3/Z, [x23, x2]\n"
- "ld1b { z0.h }, p3/Z, [x22, x2]\n"
- ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
- ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
- "ld1b { z19.h }, p3/Z, [x21, x2]\n"
- "ld1b { z28.h }, p3/Z, [x20, x2]\n"
- ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
- ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
- ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
- ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
- ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
- ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
- ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x454c1339 // ssublb z25.h, z25.b, z12.b\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c12f7 // ssublb z23.h, z23.b, z12.b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov z17.d, z6.d\n"
+ "mov z8.d, z30.d\n"
+ "mov z21.d, z6.d\n"
+ "mov z27.d, z30.d\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov z7.d, z6.d\n"
+ "mov z9.d, z30.d\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ld1b { z26.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z16.h }, p3/Z, [x28, x2]\n"
+ "ld1b { z24.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z5.h }, p3/Z, [x26, x2]\n"
+ "ld1b { z18.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x24, x2]\n"
+ "ld1b { z19.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z11.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x454e1b5a // usublb z26.h, z26.b, z14.b\n"
+ ".inst 0x454e1a10 // usublb z16.h, z16.b, z14.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454e1b18 // usublb z24.h, z24.b, z14.b\n"
+ ".inst 0x454e18a5 // usublb z5.h, z5.b, z14.b\n"
+ ".inst 0x454e1a52 // usublb z18.h, z18.b, z14.b\n"
+ ".inst 0x454e1863 // usublb z3.h, z3.b, z14.b\n"
+ ".inst 0x454e1a73 // usublb z19.h, z19.b, z14.b\n"
+ ".inst 0x454e196b // usublb z11.h, z11.b, z14.b\n"
+ ".inst 0x454e1a94 // usublb z20.h, z20.b, z14.b\n"
+ ".inst 0x454e1bbd // usublb z29.h, z29.b, z14.b\n"
"b.any 1b\n"
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
index d0e8639229..a553f1be9e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,10 +64,10 @@ class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
std::unique_ptr<const IDepthfirstStrategy> m_strat;
/* Compute the amount of working space required for a single thread. */
- virtual size_t get_working_size_per_thread() const = 0;
+ virtual size_t get_working_size_per_thread(unsigned int) const = 0;
/* Initialise the working space for a thread. */
- virtual void initialise_working_space(void *) const = 0;
+ virtual void initialise_working_space(void *, unsigned int) const = 0;
/* Compute a portion of the output tensor with padding. */
virtual void compute_tile_padded(
@@ -148,8 +148,8 @@ class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
{
// Get and initialise the working space for this thread.
void *thread_working_space =
- static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
- this->initialise_working_space(thread_working_space);
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_channels);
+ this->initialise_working_space(thread_working_space, n_channels);
// Construct convenient representations of the input/output tensors.
TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
@@ -289,9 +289,14 @@ class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
{
}
- size_t get_working_size(unsigned int n_threads) const override final
+ size_t get_working_size(unsigned int n_threads) const override
{
- return n_threads * this->get_working_size_per_thread();
+ return this->get_working_size(n_threads, this->m_args.n_channels);
+ }
+
+ size_t get_working_size(unsigned int n_threads, unsigned int n_channels) const override final
+ {
+ return n_threads * this->get_working_size_per_thread(n_channels);
}
};
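
Side note on the hunk above (not part of the patch): the pooling depth-first driver now sizes its per-thread working space as a function of the channel count, and the single-argument get_working_size(n_threads) delegates to the new two-argument overload. The following is a minimal standalone sketch of that arithmetic; the helper names (per_thread_working_size, thread_slice) and the byte-per-channel cost model are illustrative assumptions, not Compute Library API.

#include <cstddef>
#include <cstdint>

// Placeholder cost model; the real kernels derive this from the strategy
// and the requested channel count.
static size_t per_thread_working_size(unsigned int n_channels)
{
  return static_cast<size_t>(n_channels) * sizeof(float);
}

// Mirrors the new get_working_size(n_threads, n_channels) overload:
// total space is n_threads copies of the per-thread requirement.
static size_t total_working_size(unsigned int n_threads, unsigned int n_channels)
{
  return n_threads * per_thread_working_size(n_channels);
}

// Mirrors the offset computed in the execute() hunk: each thread's slice
// starts at thread_id * per-thread size, now evaluated for n_channels.
static uint8_t *thread_slice(uint8_t *working_space, unsigned int thread_id,
                             unsigned int n_channels)
{
  return working_space + thread_id * per_thread_working_size(n_channels);
}
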
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 5df848d1dd..45315d5a5d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -82,13 +82,13 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr d7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr d8, [%x[args], %[offsetof_rescale]]\n"
"ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "cmp x3, #0x8\n"
"mov x4, #0x0\n"
+ "mov x5, #0x0\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "mov x5, #0x0\n"
+ "cmp x3, #0x8\n"
"ldp x6, x7, [x21, #0x0]\n"
"ldp x8, x17, [x21, #0x10]\n"
"ldp x16, x15, [x20, #0x0]\n"
@@ -100,142 +100,142 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x60]\n"
"ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q6, [x11, x4]\n"
- "ldr q5, [x10, x4]\n"
+ "ldr q7, [x11, x4]\n"
+ "ldr q6, [x10, x4]\n"
"lsr x20, x3, #0x3\n"
+ "ldr q5, [x27, x4]\n"
+ "ldr q4, [x26, x4]\n"
+ "ldr q3, [x15, x4]\n"
+ "ldr q2, [x14, x4]\n"
+ "ldr q1, [x12, x4]\n"
+ "ldr q0, [x28, x4]\n"
"sub x3, x3, x20, LSL #3\n"
- "ldr q4, [x27, x4]\n"
- "ldr q3, [x26, x4]\n"
"subs x20, x20, #0x1\n"
- "ldr q2, [x15, x4]\n"
- "ldr q1, [x14, x4]\n"
- "ldr q0, [x12, x4]\n"
- "ldr q31, [x28, x4]\n"
- "ldr q30, [x9, x4]\n"
- "ldr q29, [x25, x4]\n"
- "ldr q28, [x23, x4]\n"
- "ldr q27, [x22, x4]\n"
- "ldr q26, [x16, x4]\n"
- "ldr q25, [x13, x4]\n"
- "ldr q24, [x24, x4]\n"
- "ldr q23, [x21, x4]\n"
+ "ldr q31, [x9, x4]\n"
+ "ldr q30, [x25, x4]\n"
+ "ldr q29, [x23, x4]\n"
+ "ldr q28, [x22, x4]\n"
+ "ldr q27, [x16, x4]\n"
+ "ldr q26, [x13, x4]\n"
+ "ldr q25, [x24, x4]\n"
+ "ldr q24, [x21, x4]\n"
"add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.8h, v6.8h, v5.8h\n"
- "ldr q6, [x11, x4]\n"
- "ldr q5, [x10, x4]\n"
- "fadd v16.8h, v4.8h, v3.8h\n"
- "ldr q4, [x27, x4]\n"
- "ldr q3, [x26, x4]\n"
- "fadd v19.8h, v17.8h, v16.8h\n"
- "fadd v18.8h, v2.8h, v1.8h\n"
- "ldr q2, [x15, x4]\n"
- "ldr q1, [x14, x4]\n"
- "fadd v17.8h, v0.8h, v31.8h\n"
- "fadd v22.8h, v30.8h, v29.8h\n"
- "ldr q0, [x12, x4]\n"
- "ldr q31, [x28, x4]\n"
- "fadd v16.8h, v28.8h, v27.8h\n"
- "fadd v21.8h, v18.8h, v19.8h\n"
- "ldr q30, [x9, x4]\n"
- "ldr q29, [x25, x4]\n"
- "fadd v20.8h, v16.8h, v19.8h\n"
- "fadd v19.8h, v26.8h, v17.8h\n"
- "ldr q28, [x23, x4]\n"
- "ldr q27, [x22, x4]\n"
- "fadd v18.8h, v25.8h, v22.8h\n"
- "fadd v17.8h, v24.8h, v17.8h\n"
- "ldr q26, [x16, x4]\n"
- "ldr q25, [x13, x4]\n"
- "fadd v16.8h, v23.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "ldr q24, [x24, x4]\n"
- "ldr q23, [x21, x4]\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
- "fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v16.8h, v20.8h\n"
+ "fadd v19.8h, v7.8h, v6.8h\n"
+ "ldr q7, [x11, x4]\n"
+ "ldr q6, [x10, x4]\n"
+ "fadd v16.8h, v5.8h, v4.8h\n"
+ "ldr q5, [x27, x4]\n"
+ "ldr q4, [x26, x4]\n"
+ "fadd v23.8h, v3.8h, v2.8h\n"
+ "fadd v18.8h, v1.8h, v0.8h\n"
+ "ldr q3, [x15, x4]\n"
+ "ldr q2, [x14, x4]\n"
+ "fadd v17.8h, v31.8h, v30.8h\n"
+ "fadd v22.8h, v29.8h, v28.8h\n"
+ "ldr q1, [x12, x4]\n"
+ "ldr q0, [x28, x4]\n"
+ "fadd v16.8h, v19.8h, v16.8h\n"
"subs x20, x20, #0x1\n"
- "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "ldr q31, [x9, x4]\n"
+ "ldr q30, [x25, x4]\n"
+ "fadd v19.8h, v27.8h, v18.8h\n"
+ "fadd v21.8h, v25.8h, v18.8h\n"
+ "ldr q29, [x23, x4]\n"
+ "ldr q28, [x22, x4]\n"
+ "fadd v18.8h, v26.8h, v17.8h\n"
+ "fadd v20.8h, v24.8h, v17.8h\n"
+ "ldr q27, [x16, x4]\n"
+ "ldr q26, [x13, x4]\n"
+ "fadd v17.8h, v23.8h, v16.8h\n"
+ "fadd v16.8h, v22.8h, v16.8h\n"
+ "ldr q25, [x24, x4]\n"
+ "ldr q24, [x21, x4]\n"
"add x4, x4, #0x10\n"
- "fmul v18.8h, v18.8h, v7.h[1]\n"
- "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "fadd v19.8h, v17.8h, v19.8h\n"
+ "fadd v18.8h, v17.8h, v18.8h\n"
+ "fadd v17.8h, v21.8h, v16.8h\n"
+ "fadd v16.8h, v20.8h, v16.8h\n"
+ "fmul v19.8h, v19.8h, v8.h[0]\n"
+ "fmul v18.8h, v18.8h, v8.h[1]\n"
+ "fmul v17.8h, v17.8h, v8.h[2]\n"
+ "fmul v16.8h, v16.8h, v8.h[3]\n"
"str q19, [x6, x5]\n"
- "fmul v16.8h, v16.8h, v7.h[3]\n"
"str q18, [x7, x5]\n"
"str q17, [x8, x5]\n"
"str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.8h, v6.8h, v5.8h\n"
- "fadd v16.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v17.8h, v16.8h\n"
- "fadd v18.8h, v2.8h, v1.8h\n"
- "fadd v17.8h, v0.8h, v31.8h\n"
- "fadd v22.8h, v30.8h, v29.8h\n"
- "fadd v16.8h, v28.8h, v27.8h\n"
- "fadd v21.8h, v18.8h, v19.8h\n"
- "fadd v20.8h, v16.8h, v19.8h\n"
- "fadd v19.8h, v26.8h, v17.8h\n"
- "fadd v18.8h, v25.8h, v22.8h\n"
- "fadd v17.8h, v24.8h, v17.8h\n"
- "fadd v16.8h, v23.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
- "fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v16.8h, v20.8h\n"
- "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "fadd v19.8h, v7.8h, v6.8h\n"
+ "fadd v16.8h, v5.8h, v4.8h\n"
+ "fadd v23.8h, v3.8h, v2.8h\n"
+ "fadd v18.8h, v1.8h, v0.8h\n"
+ "fadd v17.8h, v31.8h, v30.8h\n"
+ "fadd v22.8h, v29.8h, v28.8h\n"
+ "fadd v16.8h, v19.8h, v16.8h\n"
+ "fadd v19.8h, v27.8h, v18.8h\n"
+ "fadd v21.8h, v25.8h, v18.8h\n"
+ "fadd v18.8h, v26.8h, v17.8h\n"
+ "fadd v20.8h, v24.8h, v17.8h\n"
+ "fadd v17.8h, v23.8h, v16.8h\n"
+ "fadd v16.8h, v22.8h, v16.8h\n"
+ "fadd v19.8h, v17.8h, v19.8h\n"
+ "fadd v18.8h, v17.8h, v18.8h\n"
+ "fadd v17.8h, v21.8h, v16.8h\n"
+ "fadd v16.8h, v20.8h, v16.8h\n"
+ "fmul v19.8h, v19.8h, v8.h[0]\n"
+ "fmul v18.8h, v18.8h, v8.h[1]\n"
+ "fmul v17.8h, v17.8h, v8.h[2]\n"
+ "fmul v16.8h, v16.8h, v8.h[3]\n"
"str q19, [x6, x5]\n"
- "fmul v18.8h, v18.8h, v7.h[1]\n"
- "fmul v17.8h, v17.8h, v7.h[2]\n"
"str q18, [x7, x5]\n"
- "fmul v16.8h, v16.8h, v7.h[3]\n"
"str q17, [x8, x5]\n"
"str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
"cbz x3, 4f\n"
"3:" // Oddments
- "ldr h17, [x11, x4]\n"
- "ldr h16, [x10, x4]\n"
- "fadd v18.8h, v17.8h, v16.8h\n"
+ "ldr h22, [x11, x4]\n"
+ "ldr h21, [x10, x4]\n"
"subs x3, x3, #0x1\n"
- "ldr h17, [x27, x4]\n"
+ "ldr h20, [x27, x4]\n"
"ldr h16, [x26, x4]\n"
- "fadd v16.8h, v17.8h, v16.8h\n"
- "fadd v18.8h, v18.8h, v16.8h\n"
- "ldr h17, [x15, x4]\n"
- "ldr h16, [x14, x4]\n"
- "fadd v16.8h, v17.8h, v16.8h\n"
- "fadd v23.8h, v16.8h, v18.8h\n"
- "ldr h17, [x12, x4]\n"
- "ldr h16, [x28, x4]\n"
- "fadd v22.8h, v17.8h, v16.8h\n"
- "ldr h17, [x9, x4]\n"
- "ldr h16, [x25, x4]\n"
- "fadd v21.8h, v17.8h, v16.8h\n"
- "ldr h17, [x23, x4]\n"
+ "ldr h19, [x15, x4]\n"
+ "ldr h18, [x14, x4]\n"
+ "ldr h23, [x12, x4]\n"
+ "ldr h17, [x28, x4]\n"
+ "fadd v22.8h, v22.8h, v21.8h\n"
+ "ldr h27, [x9, x4]\n"
+ "ldr h26, [x25, x4]\n"
+ "fadd v20.8h, v20.8h, v16.8h\n"
+ "ldr h25, [x23, x4]\n"
"ldr h16, [x22, x4]\n"
- "fadd v16.8h, v17.8h, v16.8h\n"
- "fadd v20.8h, v16.8h, v18.8h\n"
- "ldr h17, [x16, x4]\n"
- "ldr h16, [x13, x4]\n"
- "fadd v19.8h, v17.8h, v22.8h\n"
- "fadd v18.8h, v16.8h, v21.8h\n"
+ "fadd v21.8h, v19.8h, v18.8h\n"
+ "ldr h19, [x16, x4]\n"
+ "ldr h18, [x13, x4]\n"
+ "fadd v24.8h, v23.8h, v17.8h\n"
"ldr h17, [x24, x4]\n"
- "ldr h16, [x21, x4]\n"
- "fadd v17.8h, v17.8h, v22.8h\n"
- "fadd v16.8h, v16.8h, v21.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v18.8h, v23.8h, v18.8h\n"
+ "ldr h23, [x21, x4]\n"
+ "fadd v22.8h, v22.8h, v20.8h\n"
+ "fadd v20.8h, v27.8h, v26.8h\n"
+ "fadd v16.8h, v25.8h, v16.8h\n"
"add x4, x4, #0x2\n"
- "fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v16.8h, v20.8h\n"
- "fmul v19.8h, v19.8h, v7.h[0]\n"
- "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fadd v19.8h, v19.8h, v24.8h\n"
+ "fadd v21.8h, v21.8h, v22.8h\n"
+ "fadd v18.8h, v18.8h, v20.8h\n"
+ "fadd v17.8h, v17.8h, v24.8h\n"
+ "fadd v20.8h, v23.8h, v20.8h\n"
+ "fadd v16.8h, v16.8h, v22.8h\n"
+ "fadd v19.8h, v21.8h, v19.8h\n"
+ "fadd v18.8h, v21.8h, v18.8h\n"
+ "fadd v17.8h, v17.8h, v16.8h\n"
+ "fadd v16.8h, v20.8h, v16.8h\n"
+ "fmul v19.8h, v19.8h, v8.h[0]\n"
+ "fmul v18.8h, v18.8h, v8.h[1]\n"
+ "fmul v17.8h, v17.8h, v8.h[2]\n"
+ "fmul v16.8h, v16.8h, v8.h[3]\n"
"str h19, [x6, x5]\n"
- "fmul v17.8h, v17.8h, v7.h[2]\n"
- "fmul v16.8h, v16.8h, v7.h[3]\n"
"str h18, [x7, x5]\n"
"str h17, [x8, x5]\n"
"str h16, [x17, x5]\n"
@@ -244,7 +244,7 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"4:" // End
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index f7be92e53f..15696d3e76 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,127 +42,127 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v9.8h }, [%x[rescale_ptr]]\n"
+ "ld1r { v10.8h }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x20\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v9.16b, #0x0\n"
"movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"movi v7.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fadd v22.8h, v2.8h, v1.8h\n"
- "ldr q2, [x21, x26]\n"
- "fadd v18.8h, v27.8h, v21.8h\n"
- "ldr q1, [x20, x26]\n"
- "fadd v21.8h, v0.8h, v31.8h\n"
- "ldr q0, [x21, x24]\n"
- "fadd v17.8h, v26.8h, v20.8h\n"
- "ldr q31, [x20, x24]\n"
- "fadd v20.8h, v30.8h, v29.8h\n"
- "ldr q30, [x21, x23]\n"
+ "fadd v23.8h, v5.8h, v4.8h\n"
+ "fadd v19.8h, v3.8h, v2.8h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd v22.8h, v1.8h, v0.8h\n"
+ "fadd v18.8h, v31.8h, v30.8h\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "fadd v21.8h, v29.8h, v21.8h\n"
+ "fadd v17.8h, v28.8h, v27.8h\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fadd v20.8h, v26.8h, v20.8h\n"
"fadd v16.8h, v25.8h, v24.8h\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"fadd v17.8h, v21.8h, v17.8h\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "fadd v7.8h, v7.8h, v18.8h\n"
- "fadd v6.8h, v6.8h, v17.8h\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "fadd v5.8h, v5.8h, v16.8h\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "fadd v9.8h, v9.8h, v19.8h\n"
+ "fadd v8.8h, v8.8h, v18.8h\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.8h, v4.8h, v3.8h\n"
- "fadd v19.8h, v28.8h, v22.8h\n"
- "fadd v22.8h, v2.8h, v1.8h\n"
- "fadd v18.8h, v27.8h, v21.8h\n"
- "fadd v21.8h, v0.8h, v31.8h\n"
- "fadd v17.8h, v26.8h, v20.8h\n"
- "fadd v20.8h, v30.8h, v29.8h\n"
+ "fadd v23.8h, v5.8h, v4.8h\n"
+ "fadd v19.8h, v3.8h, v2.8h\n"
+ "fadd v22.8h, v1.8h, v0.8h\n"
+ "fadd v18.8h, v31.8h, v30.8h\n"
+ "fadd v21.8h, v29.8h, v21.8h\n"
+ "fadd v17.8h, v28.8h, v27.8h\n"
+ "fadd v20.8h, v26.8h, v20.8h\n"
"fadd v16.8h, v25.8h, v24.8h\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
"fadd v17.8h, v21.8h, v17.8h\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "fadd v8.8h, v8.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v18.8h\n"
- "fadd v6.8h, v6.8h, v17.8h\n"
- "fadd v5.8h, v5.8h, v16.8h\n"
+ "fadd v9.8h, v9.8h, v19.8h\n"
+ "fadd v8.8h, v8.8h, v18.8h\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v16.8h\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "fadd v9.8h, v9.8h, v19.8h\n"
+ "fadd v8.8h, v8.8h, v18.8h\n"
"fadd v7.8h, v7.8h, v17.8h\n"
"fadd v6.8h, v6.8h, v16.8h\n"
- "ldr q16, [x20, x23]\n"
- "fadd v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x20\n"
+ "fmul v9.8h, v9.8h, v10.8h\n"
+ "fmul v8.8h, v8.8h, v10.8h\n"
"cmp %x[n_channels], #0x20\n"
- "fmul v8.8h, v8.8h, v9.8h\n"
- "fmul v7.8h, v7.8h, v9.8h\n"
- "fmul v6.8h, v6.8h, v9.8h\n"
- "fmul v5.8h, v5.8h, v9.8h\n"
- "str q8, [%x[outptr], x27]\n"
+ "fmul v7.8h, v7.8h, v10.8h\n"
+ "fmul v6.8h, v6.8h, v10.8h\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q7, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "str q7, [%x[outptr], x26]\n"
+ "str q6, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q6, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q5, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
@@ -170,178 +170,178 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "movi v8.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
+ "movi v9.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v17.8h, v4.8h, v3.8h\n"
- "fadd v16.8h, v28.8h, v22.8h\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fadd v16.8h, v17.8h, v16.8h\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "fadd v17.8h, v5.8h, v4.8h\n"
+ "fadd v16.8h, v3.8h, v2.8h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "fadd v8.8h, v8.8h, v16.8h\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v9.8h, v9.8h, v16.8h\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v17.8h, v4.8h, v3.8h\n"
- "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v17.8h, v5.8h, v4.8h\n"
+ "fadd v16.8h, v3.8h, v2.8h\n"
"fadd v16.8h, v17.8h, v16.8h\n"
- "fadd v8.8h, v8.8h, v16.8h\n"
+ "fadd v9.8h, v9.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x9]\n"
+ "fadd v9.8h, v9.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x8\n"
+ "fmul v9.8h, v9.8h, v10.8h\n"
"cmp %x[n_channels], #0x8\n"
- "fmul v8.8h, v8.8h, v9.8h\n"
- "str q8, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x27\n"
- "movi v8.16b, #0x0\n"
+ "add %x[outptr], %x[outptr], x9\n"
+ "movi v9.16b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
- "ld1 { v3.h }[6], [x22], #0x2\n"
- "ld1 { v28.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
+ "ld1 { v3.h }[6], [x21], #0x2\n"
+ "ld1 { v2.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
- "ld1 { v3.h }[4], [x22], #0x2\n"
- "ld1 { v28.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
+ "ld1 { v3.h }[4], [x21], #0x2\n"
+ "ld1 { v2.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h2, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fadd v17.8h, v4.8h, v3.8h\n"
- "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v17.8h, v5.8h, v4.8h\n"
+ "fadd v16.8h, v3.8h, v2.8h\n"
"subs x25, x25, #0x1\n"
"fadd v16.8h, v17.8h, v16.8h\n"
- "fadd v8.8h, v8.8h, v16.8h\n"
+ "fadd v9.8h, v9.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
"subs x21, x21, #0x1\n"
- "fadd v8.8h, v8.8h, v4.8h\n"
+ "fadd v9.8h, v9.8h, v5.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
- "fmul v8.8h, v8.8h, v9.8h\n"
+ "fmul v9.8h, v9.8h, v10.8h\n"
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v9.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
"31:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 4b073b9076..83293fb4f5 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,11 +65,11 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
__asm__ __volatile__(
"ldr x16, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
- "cmp x16, #0x8\n"
"mov x15, #0x0\n"
+ "mov x14, #0x0\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "mov x12, #0x0\n"
+ "cmp x16, #0x8\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ldp x11, x10, [x21, #0x10]\n"
"ldp x9, x28, [x20, #0x0]\n"
"ldp x27, x26, [x20, #0x10]\n"
@@ -80,14 +80,14 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q30, [x28, x15]\n"
"ldr q29, [x25, x15]\n"
"lsr x20, x16, #0x3\n"
- "sub x16, x16, x20, LSL #3\n"
"ldr q28, [x22, x15]\n"
"ldr q27, [x26, x15]\n"
- "subs x20, x20, #0x1\n"
"ldr q26, [x9, x15]\n"
"ldr q25, [x27, x15]\n"
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
+ "sub x16, x16, x20, LSL #3\n"
+ "subs x20, x20, #0x1\n"
"ldr q22, [x21, x15]\n"
"add x15, x15, #0x10\n"
"beq 2f\n"
@@ -107,62 +107,62 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
"subs x20, x20, #0x1\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
"ldr q22, [x21, x15]\n"
+ "fmax v19.8h, v21.8h, v19.8h\n"
"fmax v18.8h, v18.8h, v21.8h\n"
- "fmax v17.8h, v17.8h, v20.8h\n"
"add x15, x15, #0x10\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "str q19, [x14, x12]\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.8h, v30.8h, v29.8h\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "fmax v16.8h, v27.8h, v26.8h\n"
+ "fmax v19.8h, v27.8h, v26.8h\n"
"fmax v18.8h, v25.8h, v24.8h\n"
"fmax v17.8h, v27.8h, v23.8h\n"
- "fmax v19.8h, v24.8h, v22.8h\n"
- "fmax v16.8h, v21.8h, v16.8h\n"
+ "fmax v16.8h, v24.8h, v22.8h\n"
+ "fmax v19.8h, v21.8h, v19.8h\n"
"fmax v18.8h, v18.8h, v21.8h\n"
- "str q16, [x14, x12]\n"
"fmax v17.8h, v17.8h, v20.8h\n"
- "fmax v16.8h, v20.8h, v19.8h\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "fmax v16.8h, v20.8h, v16.8h\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
"ldr h16, [x28, x15]\n"
- "ldr h17, [x25, x15]\n"
- "fmax v23.8h, v16.8h, v17.8h\n"
+ "ldr h24, [x25, x15]\n"
"subs x16, x16, #0x1\n"
- "ldr h16, [x22, x15]\n"
- "ldr h22, [x26, x15]\n"
- "fmax v21.8h, v17.8h, v16.8h\n"
- "ldr h16, [x9, x15]\n"
- "ldr h17, [x27, x15]\n"
- "fmax v16.8h, v22.8h, v16.8h\n"
- "fmax v20.8h, v23.8h, v16.8h\n"
- "ldr h19, [x24, x15]\n"
- "ldr h16, [x23, x15]\n"
- "fmax v18.8h, v17.8h, v19.8h\n"
- "fmax v17.8h, v22.8h, v16.8h\n"
+ "ldr h20, [x22, x15]\n"
+ "ldr h23, [x26, x15]\n"
+ "ldr h19, [x9, x15]\n"
+ "ldr h18, [x27, x15]\n"
+ "ldr h22, [x24, x15]\n"
+ "ldr h17, [x23, x15]\n"
+ "fmax v21.8h, v16.8h, v24.8h\n"
"ldr h16, [x21, x15]\n"
- "fmax v16.8h, v19.8h, v16.8h\n"
+ "fmax v20.8h, v24.8h, v20.8h\n"
"add x15, x15, #0x2\n"
- "fmax v18.8h, v18.8h, v23.8h\n"
- "fmax v17.8h, v17.8h, v21.8h\n"
- "fmax v16.8h, v21.8h, v16.8h\n"
- "str h20, [x14, x12]\n"
- "str h18, [x13, x12]\n"
- "str h17, [x11, x12]\n"
- "str h16, [x10, x12]\n"
- "add x12, x12, #0x2\n"
+ "fmax v19.8h, v23.8h, v19.8h\n"
+ "fmax v18.8h, v18.8h, v22.8h\n"
+ "fmax v17.8h, v23.8h, v17.8h\n"
+ "fmax v16.8h, v22.8h, v16.8h\n"
+ "fmax v19.8h, v21.8h, v19.8h\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v16.8h\n"
+ "str h19, [x13, x14]\n"
+ "str h18, [x12, x14]\n"
+ "str h17, [x11, x14]\n"
+ "str h16, [x10, x14]\n"
+ "add x14, x14, #0x2\n"
"bgt 3b\n"
"4:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
index c92e2cdebd..d7bf97db02 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,122 +41,122 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x20\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov x24, %x[inptrs]\n"
+ "dup v9.8h, w20\n"
"dup v8.8h, w20\n"
"dup v7.8h, w20\n"
"dup v6.8h, w20\n"
- "dup v5.8h, w20\n"
- "mov x22, %x[inptrs]\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fmax v22.8h, v2.8h, v1.8h\n"
- "ldr q2, [x21, x26]\n"
- "fmax v18.8h, v27.8h, v21.8h\n"
- "ldr q1, [x20, x26]\n"
- "fmax v21.8h, v0.8h, v31.8h\n"
- "ldr q0, [x21, x24]\n"
- "fmax v17.8h, v26.8h, v20.8h\n"
- "ldr q31, [x20, x24]\n"
- "fmax v20.8h, v30.8h, v29.8h\n"
- "ldr q30, [x21, x23]\n"
+ "fmax v23.8h, v5.8h, v4.8h\n"
+ "fmax v19.8h, v3.8h, v2.8h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax v22.8h, v1.8h, v0.8h\n"
+ "fmax v18.8h, v31.8h, v30.8h\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "fmax v21.8h, v29.8h, v21.8h\n"
+ "fmax v17.8h, v28.8h, v27.8h\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fmax v20.8h, v26.8h, v20.8h\n"
"fmax v16.8h, v25.8h, v24.8h\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"fmax v17.8h, v21.8h, v17.8h\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "fmax v7.8h, v7.8h, v18.8h\n"
- "fmax v6.8h, v6.8h, v17.8h\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "fmax v5.8h, v5.8h, v16.8h\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "fmax v9.8h, v9.8h, v19.8h\n"
+ "fmax v8.8h, v8.8h, v18.8h\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.8h, v4.8h, v3.8h\n"
- "fmax v19.8h, v28.8h, v22.8h\n"
- "fmax v22.8h, v2.8h, v1.8h\n"
- "fmax v18.8h, v27.8h, v21.8h\n"
- "fmax v21.8h, v0.8h, v31.8h\n"
- "fmax v17.8h, v26.8h, v20.8h\n"
- "fmax v20.8h, v30.8h, v29.8h\n"
+ "fmax v23.8h, v5.8h, v4.8h\n"
+ "fmax v19.8h, v3.8h, v2.8h\n"
+ "fmax v22.8h, v1.8h, v0.8h\n"
+ "fmax v18.8h, v31.8h, v30.8h\n"
+ "fmax v21.8h, v29.8h, v21.8h\n"
+ "fmax v17.8h, v28.8h, v27.8h\n"
+ "fmax v20.8h, v26.8h, v20.8h\n"
"fmax v16.8h, v25.8h, v24.8h\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
"fmax v17.8h, v21.8h, v17.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "fmax v8.8h, v8.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v18.8h\n"
- "fmax v6.8h, v6.8h, v17.8h\n"
- "fmax v5.8h, v5.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v19.8h\n"
+ "fmax v8.8h, v8.8h, v18.8h\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v16.8h\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "fmax v9.8h, v9.8h, v19.8h\n"
+ "fmax v8.8h, v8.8h, v18.8h\n"
"fmax v7.8h, v7.8h, v17.8h\n"
"fmax v6.8h, v6.8h, v16.8h\n"
- "ldr q16, [x20, x23]\n"
- "fmax v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x20\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
"cmp %x[n_channels], #0x20\n"
- "str q8, [%x[outptr], x27]\n"
- "str q7, [%x[outptr], x26]\n"
+ "str q8, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q7, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
+ "str q6, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q6, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q5, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
@@ -165,177 +165,177 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
- "dup v8.8h, w20\n"
- "mov x22, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
+ "dup v9.8h, w20\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v17.8h, v4.8h, v3.8h\n"
- "fmax v16.8h, v28.8h, v22.8h\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fmax v16.8h, v17.8h, v16.8h\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "fmax v17.8h, v5.8h, v4.8h\n"
+ "fmax v16.8h, v3.8h, v2.8h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "fmax v8.8h, v8.8h, v16.8h\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v17.8h, v4.8h, v3.8h\n"
- "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v17.8h, v5.8h, v4.8h\n"
+ "fmax v16.8h, v3.8h, v2.8h\n"
"fmax v16.8h, v17.8h, v16.8h\n"
- "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x9]\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x8\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"cmp %x[n_channels], #0x8\n"
- "str q8, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
"mov w20, #0xfc00\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
- "dup v8.8h, w20\n"
- "add %x[outptr], %x[outptr], x27\n"
+ "add %x[outptr], %x[outptr], x9\n"
"mov x24, %x[inptrs]\n"
+ "dup v9.8h, w20\n"
"cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
- "ld1 { v3.h }[6], [x22], #0x2\n"
- "ld1 { v28.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
+ "ld1 { v3.h }[6], [x21], #0x2\n"
+ "ld1 { v2.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
- "ld1 { v3.h }[4], [x22], #0x2\n"
- "ld1 { v28.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
+ "ld1 { v3.h }[4], [x21], #0x2\n"
+ "ld1 { v2.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h2, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fmax v17.8h, v4.8h, v3.8h\n"
- "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v17.8h, v5.8h, v4.8h\n"
+ "fmax v16.8h, v3.8h, v2.8h\n"
"subs x25, x25, #0x1\n"
"fmax v16.8h, v17.8h, v16.8h\n"
- "fmax v8.8h, v8.8h, v16.8h\n"
+ "fmax v9.8h, v9.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
"subs x21, x21, #0x1\n"
- "fmax v8.8h, v8.8h, v4.8h\n"
+ "fmax v9.8h, v9.8h, v5.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v9.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
"31:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index cf0047638e..86095a6f2c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -82,13 +82,13 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr q7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr q8, [%x[args], %[offsetof_rescale]]\n"
"ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "cmp x3, #0x4\n"
"mov x4, #0x0\n"
+ "mov x5, #0x0\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "mov x5, #0x0\n"
+ "cmp x3, #0x4\n"
"ldp x6, x7, [x21, #0x0]\n"
"ldp x8, x17, [x21, #0x10]\n"
"ldp x16, x15, [x20, #0x0]\n"
@@ -100,142 +100,142 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x24, x23, [x20, #0x60]\n"
"ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q6, [x11, x4]\n"
- "ldr q5, [x10, x4]\n"
+ "ldr q7, [x11, x4]\n"
+ "ldr q6, [x10, x4]\n"
"lsr x20, x3, #0x2\n"
+ "ldr q5, [x27, x4]\n"
+ "ldr q4, [x26, x4]\n"
+ "ldr q3, [x15, x4]\n"
+ "ldr q2, [x14, x4]\n"
+ "ldr q1, [x12, x4]\n"
+ "ldr q0, [x28, x4]\n"
"sub x3, x3, x20, LSL #2\n"
- "ldr q4, [x27, x4]\n"
- "ldr q3, [x26, x4]\n"
"subs x20, x20, #0x1\n"
- "ldr q2, [x15, x4]\n"
- "ldr q1, [x14, x4]\n"
- "ldr q0, [x12, x4]\n"
- "ldr q31, [x28, x4]\n"
- "ldr q30, [x9, x4]\n"
- "ldr q29, [x25, x4]\n"
- "ldr q28, [x23, x4]\n"
- "ldr q27, [x22, x4]\n"
- "ldr q26, [x16, x4]\n"
- "ldr q25, [x13, x4]\n"
- "ldr q24, [x24, x4]\n"
- "ldr q23, [x21, x4]\n"
+ "ldr q31, [x9, x4]\n"
+ "ldr q30, [x25, x4]\n"
+ "ldr q29, [x23, x4]\n"
+ "ldr q28, [x22, x4]\n"
+ "ldr q27, [x16, x4]\n"
+ "ldr q26, [x13, x4]\n"
+ "ldr q25, [x24, x4]\n"
+ "ldr q24, [x21, x4]\n"
"add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.4s, v6.4s, v5.4s\n"
- "ldr q6, [x11, x4]\n"
- "ldr q5, [x10, x4]\n"
- "fadd v16.4s, v4.4s, v3.4s\n"
- "ldr q4, [x27, x4]\n"
- "ldr q3, [x26, x4]\n"
- "fadd v19.4s, v17.4s, v16.4s\n"
- "fadd v18.4s, v2.4s, v1.4s\n"
- "ldr q2, [x15, x4]\n"
- "ldr q1, [x14, x4]\n"
- "fadd v17.4s, v0.4s, v31.4s\n"
- "fadd v22.4s, v30.4s, v29.4s\n"
- "ldr q0, [x12, x4]\n"
- "ldr q31, [x28, x4]\n"
- "fadd v16.4s, v28.4s, v27.4s\n"
- "fadd v21.4s, v18.4s, v19.4s\n"
- "ldr q30, [x9, x4]\n"
- "ldr q29, [x25, x4]\n"
- "fadd v20.4s, v16.4s, v19.4s\n"
- "fadd v19.4s, v26.4s, v17.4s\n"
- "ldr q28, [x23, x4]\n"
- "ldr q27, [x22, x4]\n"
- "fadd v18.4s, v25.4s, v22.4s\n"
- "fadd v17.4s, v24.4s, v17.4s\n"
- "ldr q26, [x16, x4]\n"
- "ldr q25, [x13, x4]\n"
- "fadd v16.4s, v23.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "ldr q24, [x24, x4]\n"
- "ldr q23, [x21, x4]\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
- "fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v16.4s, v20.4s\n"
+ "fadd v19.4s, v7.4s, v6.4s\n"
+ "ldr q7, [x11, x4]\n"
+ "ldr q6, [x10, x4]\n"
+ "fadd v16.4s, v5.4s, v4.4s\n"
+ "ldr q5, [x27, x4]\n"
+ "ldr q4, [x26, x4]\n"
+ "fadd v23.4s, v3.4s, v2.4s\n"
+ "fadd v18.4s, v1.4s, v0.4s\n"
+ "ldr q3, [x15, x4]\n"
+ "ldr q2, [x14, x4]\n"
+ "fadd v17.4s, v31.4s, v30.4s\n"
+ "fadd v22.4s, v29.4s, v28.4s\n"
+ "ldr q1, [x12, x4]\n"
+ "ldr q0, [x28, x4]\n"
+ "fadd v16.4s, v19.4s, v16.4s\n"
"subs x20, x20, #0x1\n"
- "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "ldr q31, [x9, x4]\n"
+ "ldr q30, [x25, x4]\n"
+ "fadd v19.4s, v27.4s, v18.4s\n"
+ "fadd v21.4s, v25.4s, v18.4s\n"
+ "ldr q29, [x23, x4]\n"
+ "ldr q28, [x22, x4]\n"
+ "fadd v18.4s, v26.4s, v17.4s\n"
+ "fadd v20.4s, v24.4s, v17.4s\n"
+ "ldr q27, [x16, x4]\n"
+ "ldr q26, [x13, x4]\n"
+ "fadd v17.4s, v23.4s, v16.4s\n"
+ "fadd v16.4s, v22.4s, v16.4s\n"
+ "ldr q25, [x24, x4]\n"
+ "ldr q24, [x21, x4]\n"
"add x4, x4, #0x10\n"
- "fmul v18.4s, v18.4s, v7.s[1]\n"
- "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "fadd v19.4s, v17.4s, v19.4s\n"
+ "fadd v18.4s, v17.4s, v18.4s\n"
+ "fadd v17.4s, v21.4s, v16.4s\n"
+ "fadd v16.4s, v20.4s, v16.4s\n"
+ "fmul v19.4s, v19.4s, v8.s[0]\n"
+ "fmul v18.4s, v18.4s, v8.s[1]\n"
+ "fmul v17.4s, v17.4s, v8.s[2]\n"
+ "fmul v16.4s, v16.4s, v8.s[3]\n"
"str q19, [x6, x5]\n"
- "fmul v16.4s, v16.4s, v7.s[3]\n"
"str q18, [x7, x5]\n"
"str q17, [x8, x5]\n"
"str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.4s, v6.4s, v5.4s\n"
- "fadd v16.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v17.4s, v16.4s\n"
- "fadd v18.4s, v2.4s, v1.4s\n"
- "fadd v17.4s, v0.4s, v31.4s\n"
- "fadd v22.4s, v30.4s, v29.4s\n"
- "fadd v16.4s, v28.4s, v27.4s\n"
- "fadd v21.4s, v18.4s, v19.4s\n"
- "fadd v20.4s, v16.4s, v19.4s\n"
- "fadd v19.4s, v26.4s, v17.4s\n"
- "fadd v18.4s, v25.4s, v22.4s\n"
- "fadd v17.4s, v24.4s, v17.4s\n"
- "fadd v16.4s, v23.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
- "fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v16.4s, v20.4s\n"
- "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "fadd v19.4s, v7.4s, v6.4s\n"
+ "fadd v16.4s, v5.4s, v4.4s\n"
+ "fadd v23.4s, v3.4s, v2.4s\n"
+ "fadd v18.4s, v1.4s, v0.4s\n"
+ "fadd v17.4s, v31.4s, v30.4s\n"
+ "fadd v22.4s, v29.4s, v28.4s\n"
+ "fadd v16.4s, v19.4s, v16.4s\n"
+ "fadd v19.4s, v27.4s, v18.4s\n"
+ "fadd v21.4s, v25.4s, v18.4s\n"
+ "fadd v18.4s, v26.4s, v17.4s\n"
+ "fadd v20.4s, v24.4s, v17.4s\n"
+ "fadd v17.4s, v23.4s, v16.4s\n"
+ "fadd v16.4s, v22.4s, v16.4s\n"
+ "fadd v19.4s, v17.4s, v19.4s\n"
+ "fadd v18.4s, v17.4s, v18.4s\n"
+ "fadd v17.4s, v21.4s, v16.4s\n"
+ "fadd v16.4s, v20.4s, v16.4s\n"
+ "fmul v19.4s, v19.4s, v8.s[0]\n"
+ "fmul v18.4s, v18.4s, v8.s[1]\n"
+ "fmul v17.4s, v17.4s, v8.s[2]\n"
+ "fmul v16.4s, v16.4s, v8.s[3]\n"
"str q19, [x6, x5]\n"
- "fmul v18.4s, v18.4s, v7.s[1]\n"
- "fmul v17.4s, v17.4s, v7.s[2]\n"
"str q18, [x7, x5]\n"
- "fmul v16.4s, v16.4s, v7.s[3]\n"
"str q17, [x8, x5]\n"
"str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
"cbz x3, 4f\n"
"3:" // Oddments
- "ldr s17, [x11, x4]\n"
- "ldr s16, [x10, x4]\n"
- "fadd v18.4s, v17.4s, v16.4s\n"
+ "ldr s22, [x11, x4]\n"
+ "ldr s21, [x10, x4]\n"
"subs x3, x3, #0x1\n"
- "ldr s17, [x27, x4]\n"
+ "ldr s20, [x27, x4]\n"
"ldr s16, [x26, x4]\n"
- "fadd v16.4s, v17.4s, v16.4s\n"
- "fadd v18.4s, v18.4s, v16.4s\n"
- "ldr s17, [x15, x4]\n"
- "ldr s16, [x14, x4]\n"
- "fadd v16.4s, v17.4s, v16.4s\n"
- "fadd v23.4s, v16.4s, v18.4s\n"
- "ldr s17, [x12, x4]\n"
- "ldr s16, [x28, x4]\n"
- "fadd v22.4s, v17.4s, v16.4s\n"
- "ldr s17, [x9, x4]\n"
- "ldr s16, [x25, x4]\n"
- "fadd v21.4s, v17.4s, v16.4s\n"
- "ldr s17, [x23, x4]\n"
+ "ldr s19, [x15, x4]\n"
+ "ldr s18, [x14, x4]\n"
+ "ldr s23, [x12, x4]\n"
+ "ldr s17, [x28, x4]\n"
+ "fadd v22.4s, v22.4s, v21.4s\n"
+ "ldr s27, [x9, x4]\n"
+ "ldr s26, [x25, x4]\n"
+ "fadd v20.4s, v20.4s, v16.4s\n"
+ "ldr s25, [x23, x4]\n"
"ldr s16, [x22, x4]\n"
- "fadd v16.4s, v17.4s, v16.4s\n"
- "fadd v20.4s, v16.4s, v18.4s\n"
- "ldr s17, [x16, x4]\n"
- "ldr s16, [x13, x4]\n"
- "fadd v19.4s, v17.4s, v22.4s\n"
- "fadd v18.4s, v16.4s, v21.4s\n"
+ "fadd v21.4s, v19.4s, v18.4s\n"
+ "ldr s19, [x16, x4]\n"
+ "ldr s18, [x13, x4]\n"
+ "fadd v24.4s, v23.4s, v17.4s\n"
"ldr s17, [x24, x4]\n"
- "ldr s16, [x21, x4]\n"
- "fadd v17.4s, v17.4s, v22.4s\n"
- "fadd v16.4s, v16.4s, v21.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v18.4s, v23.4s, v18.4s\n"
+ "ldr s23, [x21, x4]\n"
+ "fadd v22.4s, v22.4s, v20.4s\n"
+ "fadd v20.4s, v27.4s, v26.4s\n"
+ "fadd v16.4s, v25.4s, v16.4s\n"
"add x4, x4, #0x4\n"
- "fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v16.4s, v20.4s\n"
- "fmul v19.4s, v19.4s, v7.s[0]\n"
- "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fadd v19.4s, v19.4s, v24.4s\n"
+ "fadd v21.4s, v21.4s, v22.4s\n"
+ "fadd v18.4s, v18.4s, v20.4s\n"
+ "fadd v17.4s, v17.4s, v24.4s\n"
+ "fadd v20.4s, v23.4s, v20.4s\n"
+ "fadd v16.4s, v16.4s, v22.4s\n"
+ "fadd v19.4s, v21.4s, v19.4s\n"
+ "fadd v18.4s, v21.4s, v18.4s\n"
+ "fadd v17.4s, v17.4s, v16.4s\n"
+ "fadd v16.4s, v20.4s, v16.4s\n"
+ "fmul v19.4s, v19.4s, v8.s[0]\n"
+ "fmul v18.4s, v18.4s, v8.s[1]\n"
+ "fmul v17.4s, v17.4s, v8.s[2]\n"
+ "fmul v16.4s, v16.4s, v8.s[3]\n"
"str s19, [x6, x5]\n"
- "fmul v17.4s, v17.4s, v7.s[2]\n"
- "fmul v16.4s, v16.4s, v7.s[3]\n"
"str s18, [x7, x5]\n"
"str s17, [x8, x5]\n"
"str s16, [x17, x5]\n"
@@ -244,7 +244,7 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"4:" // End
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index d236f07b1c..71450f56e2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,127 +42,127 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v9.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v10.4s }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x10\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v9.16b, #0x0\n"
"movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"movi v7.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fadd v22.4s, v2.4s, v1.4s\n"
- "ldr q2, [x21, x26]\n"
- "fadd v18.4s, v27.4s, v21.4s\n"
- "ldr q1, [x20, x26]\n"
- "fadd v21.4s, v0.4s, v31.4s\n"
- "ldr q0, [x21, x24]\n"
- "fadd v17.4s, v26.4s, v20.4s\n"
- "ldr q31, [x20, x24]\n"
- "fadd v20.4s, v30.4s, v29.4s\n"
- "ldr q30, [x21, x23]\n"
+ "fadd v23.4s, v5.4s, v4.4s\n"
+ "fadd v19.4s, v3.4s, v2.4s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd v22.4s, v1.4s, v0.4s\n"
+ "fadd v18.4s, v31.4s, v30.4s\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "fadd v21.4s, v29.4s, v21.4s\n"
+ "fadd v17.4s, v28.4s, v27.4s\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fadd v20.4s, v26.4s, v20.4s\n"
"fadd v16.4s, v25.4s, v24.4s\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"fadd v17.4s, v21.4s, v17.4s\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "fadd v7.4s, v7.4s, v18.4s\n"
- "fadd v6.4s, v6.4s, v17.4s\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "fadd v5.4s, v5.4s, v16.4s\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "fadd v9.4s, v9.4s, v19.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.4s, v4.4s, v3.4s\n"
- "fadd v19.4s, v28.4s, v22.4s\n"
- "fadd v22.4s, v2.4s, v1.4s\n"
- "fadd v18.4s, v27.4s, v21.4s\n"
- "fadd v21.4s, v0.4s, v31.4s\n"
- "fadd v17.4s, v26.4s, v20.4s\n"
- "fadd v20.4s, v30.4s, v29.4s\n"
+ "fadd v23.4s, v5.4s, v4.4s\n"
+ "fadd v19.4s, v3.4s, v2.4s\n"
+ "fadd v22.4s, v1.4s, v0.4s\n"
+ "fadd v18.4s, v31.4s, v30.4s\n"
+ "fadd v21.4s, v29.4s, v21.4s\n"
+ "fadd v17.4s, v28.4s, v27.4s\n"
+ "fadd v20.4s, v26.4s, v20.4s\n"
"fadd v16.4s, v25.4s, v24.4s\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
"fadd v17.4s, v21.4s, v17.4s\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "fadd v8.4s, v8.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v18.4s\n"
- "fadd v6.4s, v6.4s, v17.4s\n"
- "fadd v5.4s, v5.4s, v16.4s\n"
+ "fadd v9.4s, v9.4s, v19.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v16.4s\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "fadd v9.4s, v9.4s, v19.4s\n"
+ "fadd v8.4s, v8.4s, v18.4s\n"
"fadd v7.4s, v7.4s, v17.4s\n"
"fadd v6.4s, v6.4s, v16.4s\n"
- "ldr q16, [x20, x23]\n"
- "fadd v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
+ "fmul v9.4s, v9.4s, v10.4s\n"
+ "fmul v8.4s, v8.4s, v10.4s\n"
"cmp %x[n_channels], #0x10\n"
- "fmul v8.4s, v8.4s, v9.4s\n"
- "fmul v7.4s, v7.4s, v9.4s\n"
- "fmul v6.4s, v6.4s, v9.4s\n"
- "fmul v5.4s, v5.4s, v9.4s\n"
- "str q8, [%x[outptr], x27]\n"
+ "fmul v7.4s, v7.4s, v10.4s\n"
+ "fmul v6.4s, v6.4s, v10.4s\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
+ "str q8, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q7, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "str q7, [%x[outptr], x26]\n"
+ "str q6, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q6, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q5, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
@@ -170,130 +170,130 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "movi v8.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
+ "movi v9.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v17.4s, v4.4s, v3.4s\n"
- "fadd v16.4s, v28.4s, v22.4s\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fadd v16.4s, v17.4s, v16.4s\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "fadd v17.4s, v5.4s, v4.4s\n"
+ "fadd v16.4s, v3.4s, v2.4s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "fadd v8.4s, v8.4s, v16.4s\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v9.4s, v9.4s, v16.4s\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v17.4s, v4.4s, v3.4s\n"
- "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v17.4s, v5.4s, v4.4s\n"
+ "fadd v16.4s, v3.4s, v2.4s\n"
"fadd v16.4s, v17.4s, v16.4s\n"
- "fadd v8.4s, v8.4s, v16.4s\n"
+ "fadd v9.4s, v9.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x9]\n"
+ "fadd v9.4s, v9.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x4\n"
+ "fmul v9.4s, v9.4s, v10.4s\n"
"cmp %x[n_channels], #0x4\n"
- "fmul v8.4s, v8.4s, v9.4s\n"
- "str q8, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x27\n"
- "movi v8.16b, #0x0\n"
+ "add %x[outptr], %x[outptr], x9\n"
+ "movi v9.16b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fadd v17.4s, v4.4s, v3.4s\n"
- "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v17.4s, v5.4s, v4.4s\n"
+ "fadd v16.4s, v3.4s, v2.4s\n"
"subs x25, x25, #0x1\n"
"fadd v16.4s, v17.4s, v16.4s\n"
- "fadd v8.4s, v8.4s, v16.4s\n"
+ "fadd v9.4s, v9.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
"subs x21, x21, #0x1\n"
- "fadd v8.4s, v8.4s, v4.4s\n"
+ "fadd v9.4s, v9.4s, v5.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
- "fmul v8.4s, v8.4s, v9.4s\n"
+ "fmul v9.4s, v9.4s, v10.4s\n"
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v9.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
"25:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index f4202de1ed..9fa8e7c609 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,11 +65,11 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
__asm__ __volatile__(
"ldr x16, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
- "cmp x16, #0x4\n"
"mov x15, #0x0\n"
+ "mov x14, #0x0\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "mov x12, #0x0\n"
+ "cmp x16, #0x4\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ldp x11, x10, [x21, #0x10]\n"
"ldp x9, x28, [x20, #0x0]\n"
"ldp x27, x26, [x20, #0x10]\n"
@@ -80,14 +80,14 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q30, [x28, x15]\n"
"ldr q29, [x25, x15]\n"
"lsr x20, x16, #0x2\n"
- "sub x16, x16, x20, LSL #2\n"
"ldr q28, [x22, x15]\n"
"ldr q27, [x26, x15]\n"
- "subs x20, x20, #0x1\n"
"ldr q26, [x9, x15]\n"
"ldr q25, [x27, x15]\n"
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
+ "sub x16, x16, x20, LSL #2\n"
+ "subs x20, x20, #0x1\n"
"ldr q22, [x21, x15]\n"
"add x15, x15, #0x10\n"
"beq 2f\n"
@@ -107,62 +107,62 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
"subs x20, x20, #0x1\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
"ldr q22, [x21, x15]\n"
+ "fmax v19.4s, v21.4s, v19.4s\n"
"fmax v18.4s, v18.4s, v21.4s\n"
- "fmax v17.4s, v17.4s, v20.4s\n"
"add x15, x15, #0x10\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "str q19, [x14, x12]\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.4s, v30.4s, v29.4s\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "fmax v16.4s, v27.4s, v26.4s\n"
+ "fmax v19.4s, v27.4s, v26.4s\n"
"fmax v18.4s, v25.4s, v24.4s\n"
"fmax v17.4s, v27.4s, v23.4s\n"
- "fmax v19.4s, v24.4s, v22.4s\n"
- "fmax v16.4s, v21.4s, v16.4s\n"
+ "fmax v16.4s, v24.4s, v22.4s\n"
+ "fmax v19.4s, v21.4s, v19.4s\n"
"fmax v18.4s, v18.4s, v21.4s\n"
- "str q16, [x14, x12]\n"
"fmax v17.4s, v17.4s, v20.4s\n"
- "fmax v16.4s, v20.4s, v19.4s\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "fmax v16.4s, v20.4s, v16.4s\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
"ldr s16, [x28, x15]\n"
- "ldr s17, [x25, x15]\n"
- "fmax v23.4s, v16.4s, v17.4s\n"
+ "ldr s24, [x25, x15]\n"
"subs x16, x16, #0x1\n"
- "ldr s16, [x22, x15]\n"
- "ldr s22, [x26, x15]\n"
- "fmax v21.4s, v17.4s, v16.4s\n"
- "ldr s16, [x9, x15]\n"
- "ldr s17, [x27, x15]\n"
- "fmax v16.4s, v22.4s, v16.4s\n"
- "fmax v20.4s, v23.4s, v16.4s\n"
- "ldr s19, [x24, x15]\n"
- "ldr s16, [x23, x15]\n"
- "fmax v18.4s, v17.4s, v19.4s\n"
- "fmax v17.4s, v22.4s, v16.4s\n"
+ "ldr s20, [x22, x15]\n"
+ "ldr s23, [x26, x15]\n"
+ "ldr s19, [x9, x15]\n"
+ "ldr s18, [x27, x15]\n"
+ "ldr s22, [x24, x15]\n"
+ "ldr s17, [x23, x15]\n"
+ "fmax v21.4s, v16.4s, v24.4s\n"
"ldr s16, [x21, x15]\n"
- "fmax v16.4s, v19.4s, v16.4s\n"
+ "fmax v20.4s, v24.4s, v20.4s\n"
"add x15, x15, #0x4\n"
- "fmax v18.4s, v18.4s, v23.4s\n"
- "fmax v17.4s, v17.4s, v21.4s\n"
- "fmax v16.4s, v21.4s, v16.4s\n"
- "str s20, [x14, x12]\n"
- "str s18, [x13, x12]\n"
- "str s17, [x11, x12]\n"
- "str s16, [x10, x12]\n"
- "add x12, x12, #0x4\n"
+ "fmax v19.4s, v23.4s, v19.4s\n"
+ "fmax v18.4s, v18.4s, v22.4s\n"
+ "fmax v17.4s, v23.4s, v17.4s\n"
+ "fmax v16.4s, v22.4s, v16.4s\n"
+ "fmax v19.4s, v21.4s, v19.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v16.4s\n"
+ "str s19, [x13, x14]\n"
+ "str s18, [x12, x14]\n"
+ "str s17, [x11, x14]\n"
+ "str s16, [x10, x14]\n"
+ "add x14, x14, #0x4\n"
"bgt 3b\n"
"4:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
index f4706635dc..317966d53a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,122 +41,122 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x10\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov x24, %x[inptrs]\n"
+ "dup v9.4s, w20\n"
"dup v8.4s, w20\n"
"dup v7.4s, w20\n"
"dup v6.4s, w20\n"
- "dup v5.4s, w20\n"
- "mov x22, %x[inptrs]\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fmax v22.4s, v2.4s, v1.4s\n"
- "ldr q2, [x21, x26]\n"
- "fmax v18.4s, v27.4s, v21.4s\n"
- "ldr q1, [x20, x26]\n"
- "fmax v21.4s, v0.4s, v31.4s\n"
- "ldr q0, [x21, x24]\n"
- "fmax v17.4s, v26.4s, v20.4s\n"
- "ldr q31, [x20, x24]\n"
- "fmax v20.4s, v30.4s, v29.4s\n"
- "ldr q30, [x21, x23]\n"
+ "fmax v23.4s, v5.4s, v4.4s\n"
+ "fmax v19.4s, v3.4s, v2.4s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax v22.4s, v1.4s, v0.4s\n"
+ "fmax v18.4s, v31.4s, v30.4s\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "fmax v21.4s, v29.4s, v21.4s\n"
+ "fmax v17.4s, v28.4s, v27.4s\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fmax v20.4s, v26.4s, v20.4s\n"
"fmax v16.4s, v25.4s, v24.4s\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"fmax v17.4s, v21.4s, v17.4s\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "fmax v7.4s, v7.4s, v18.4s\n"
- "fmax v6.4s, v6.4s, v17.4s\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "fmax v5.4s, v5.4s, v16.4s\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "fmax v9.4s, v9.4s, v19.4s\n"
+ "fmax v8.4s, v8.4s, v18.4s\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.4s, v4.4s, v3.4s\n"
- "fmax v19.4s, v28.4s, v22.4s\n"
- "fmax v22.4s, v2.4s, v1.4s\n"
- "fmax v18.4s, v27.4s, v21.4s\n"
- "fmax v21.4s, v0.4s, v31.4s\n"
- "fmax v17.4s, v26.4s, v20.4s\n"
- "fmax v20.4s, v30.4s, v29.4s\n"
+ "fmax v23.4s, v5.4s, v4.4s\n"
+ "fmax v19.4s, v3.4s, v2.4s\n"
+ "fmax v22.4s, v1.4s, v0.4s\n"
+ "fmax v18.4s, v31.4s, v30.4s\n"
+ "fmax v21.4s, v29.4s, v21.4s\n"
+ "fmax v17.4s, v28.4s, v27.4s\n"
+ "fmax v20.4s, v26.4s, v20.4s\n"
"fmax v16.4s, v25.4s, v24.4s\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
"fmax v17.4s, v21.4s, v17.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "fmax v8.4s, v8.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v18.4s\n"
- "fmax v6.4s, v6.4s, v17.4s\n"
- "fmax v5.4s, v5.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v19.4s\n"
+ "fmax v8.4s, v8.4s, v18.4s\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v16.4s\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "fmax v9.4s, v9.4s, v19.4s\n"
+ "fmax v8.4s, v8.4s, v18.4s\n"
"fmax v7.4s, v7.4s, v17.4s\n"
"fmax v6.4s, v6.4s, v16.4s\n"
- "ldr q16, [x20, x23]\n"
- "fmax v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x27]\n"
- "str q7, [%x[outptr], x26]\n"
+ "str q8, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q7, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
+ "str q6, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q6, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q5, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
@@ -165,129 +165,129 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
- "dup v8.4s, w20\n"
- "mov x22, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
+ "dup v9.4s, w20\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v17.4s, v4.4s, v3.4s\n"
- "fmax v16.4s, v28.4s, v22.4s\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "fmax v16.4s, v17.4s, v16.4s\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "fmax v17.4s, v5.4s, v4.4s\n"
+ "fmax v16.4s, v3.4s, v2.4s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "fmax v8.4s, v8.4s, v16.4s\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v17.4s, v4.4s, v3.4s\n"
- "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v17.4s, v5.4s, v4.4s\n"
+ "fmax v16.4s, v3.4s, v2.4s\n"
"fmax v16.4s, v17.4s, v16.4s\n"
- "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x9]\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x4\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"cmp %x[n_channels], #0x4\n"
- "str q8, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
"mov w20, #0xff800000\n"
"lsr x25, %x[n_valid_cells], #0x2\n"
- "dup v8.4s, w20\n"
- "add %x[outptr], %x[outptr], x27\n"
+ "add %x[outptr], %x[outptr], x9\n"
"mov x24, %x[inptrs]\n"
+ "dup v9.4s, w20\n"
"cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fmax v17.4s, v4.4s, v3.4s\n"
- "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v17.4s, v5.4s, v4.4s\n"
+ "fmax v16.4s, v3.4s, v2.4s\n"
"subs x25, x25, #0x1\n"
"fmax v16.4s, v17.4s, v16.4s\n"
- "fmax v8.4s, v8.4s, v16.4s\n"
+ "fmax v9.4s, v9.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
"subs x21, x21, #0x1\n"
- "fmax v8.4s, v8.4s, v4.4s\n"
+ "fmax v9.4s, v9.4s, v5.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v9.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
"25:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 5d082102b3..63796ab4a4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -122,9 +122,9 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"ldr q29, [x21, x26]\n"
"ldr q28, [x20, x26]\n"
@@ -137,26 +137,26 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddl v23.8h, v31.8b, v30.8b\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
- "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
"saddl v21.8h, v29.8b, v28.8b\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x21, x26]\n"
- "ldr q28, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
"saddl v19.8h, v27.8b, v26.8b\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x21, x25]\n"
- "ldr q26, [x20, x25]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x21, x24]\n"
- "ldr q24, [x20, x24]\n"
- "subs x23, x23, #0x1\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
- "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -200,17 +200,17 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
- "sxtl v23.8h, v16.8b\n"
- "sxtl2 v22.8h, v16.16b\n"
- "ldr q16, [x20, x26]\n"
+ "subs x23, x23, #0x1\n"
+ "ldr q19, [x20, x27]\n"
+ "ldr q18, [x20, x26]\n"
"ldr q17, [x20, x25]\n"
- "sxtl v21.8h, v16.8b\n"
- "sxtl2 v20.8h, v16.16b\n"
"ldr q16, [x20, x24]\n"
+ "sxtl v23.8h, v19.8b\n"
+ "sxtl2 v22.8h, v19.16b\n"
+ "sxtl v21.8h, v18.8b\n"
+ "sxtl2 v20.8h, v18.16b\n"
"sxtl v19.8h, v17.8b\n"
"sxtl2 v18.8h, v17.16b\n"
- "subs x23, x23, #0x1\n"
"sxtl v17.8h, v16.8b\n"
"sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
@@ -231,44 +231,44 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
- "ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v17.4s\n"
- "sqdmulh v14.4s, v14.4s, v17.4s\n"
- "sqdmulh v13.4s, v13.4s, v17.4s\n"
- "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v19.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v18.4s }, [%x[shift_ptr]]\n"
+ "movi v17.4s, #0x7f\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v11.4s, v11.4s, v17.4s\n"
- "sqdmulh v10.4s, v10.4s, v17.4s\n"
- "sqdmulh v9.4s, v9.4s, v17.4s\n"
- "sqdmulh v8.4s, v8.4s, v17.4s\n"
- "sqdmulh v7.4s, v7.4s, v17.4s\n"
- "sqdmulh v6.4s, v6.4s, v17.4s\n"
- "sqdmulh v5.4s, v5.4s, v17.4s\n"
- "sqdmulh v4.4s, v4.4s, v17.4s\n"
- "sqdmulh v3.4s, v3.4s, v17.4s\n"
- "sqdmulh v2.4s, v2.4s, v17.4s\n"
- "sqdmulh v1.4s, v1.4s, v17.4s\n"
- "sqdmulh v0.4s, v0.4s, v17.4s\n"
- "movi v17.4s, #0x7f\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v16.4s\n"
- "srshl v8.4s, v8.4s, v16.4s\n"
- "srshl v7.4s, v7.4s, v16.4s\n"
- "srshl v6.4s, v6.4s, v16.4s\n"
- "srshl v5.4s, v5.4s, v16.4s\n"
- "srshl v4.4s, v4.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v16.4s\n"
- "srshl v2.4s, v2.4s, v16.4s\n"
- "srshl v1.4s, v1.4s, v16.4s\n"
- "srshl v0.4s, v0.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqdmulh v12.4s, v12.4s, v19.4s\n"
+ "sqdmulh v11.4s, v11.4s, v19.4s\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "sqdmulh v8.4s, v8.4s, v19.4s\n"
+ "sqdmulh v7.4s, v7.4s, v19.4s\n"
+ "sqdmulh v6.4s, v6.4s, v19.4s\n"
+ "sqdmulh v5.4s, v5.4s, v19.4s\n"
+ "sqdmulh v4.4s, v4.4s, v19.4s\n"
+ "sqdmulh v3.4s, v3.4s, v19.4s\n"
+ "sqdmulh v2.4s, v2.4s, v19.4s\n"
+ "sqdmulh v1.4s, v1.4s, v19.4s\n"
+ "sqdmulh v0.4s, v0.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "srshl v11.4s, v11.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "srshl v8.4s, v8.4s, v18.4s\n"
+ "srshl v7.4s, v7.4s, v18.4s\n"
+ "srshl v6.4s, v6.4s, v18.4s\n"
+ "srshl v5.4s, v5.4s, v18.4s\n"
+ "srshl v4.4s, v4.4s, v18.4s\n"
+ "srshl v3.4s, v3.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v18.4s\n"
+ "srshl v0.4s, v0.4s, v18.4s\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
@@ -302,19 +302,19 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"smin v1.4s, v1.4s, v17.4s\n"
"smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
- "uzp1 v16.16b, v13.16b, v12.16b\n"
+ "uzp1 v19.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
"uzp1 v18.16b, v9.16b, v8.16b\n"
"uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
"uzp1 v20.16b, v3.16b, v2.16b\n"
- "uzp1 v19.16b, v1.16b, v0.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v1.16b, v0.16b\n"
+ "uzp1 v19.16b, v23.16b, v19.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
- "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "uzp1 v16.16b, v20.16b, v16.16b\n"
+ "str q19, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
"str q17, [%x[outptr], x25]\n"
@@ -335,23 +335,23 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
"saddl v17.8h, v31.8b, v30.8b\n"
"saddl2 v16.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
"ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
- "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v17.4h\n"
"saddw2 v14.4s, v14.4s, v17.8h\n"
"saddw v13.4s, v13.4s, v16.4h\n"
"saddw2 v12.4s, v12.4s, v16.8h\n"
- "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
"saddl v17.8h, v31.8b, v30.8b\n"
@@ -365,30 +365,30 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x23, x23, #0x1\n"
"ldr q16, [x20, x27]\n"
"sxtl v17.8h, v16.8b\n"
"sxtl2 v16.8h, v16.16b\n"
- "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v17.4h\n"
"saddw2 v14.4s, v14.4s, v17.8h\n"
"saddw v13.4s, v13.4s, v16.4h\n"
"saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
- "ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v17.4s\n"
- "sqdmulh v14.4s, v14.4s, v17.4s\n"
- "sqdmulh v13.4s, v13.4s, v17.4s\n"
- "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v19.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v18.4s }, [%x[shift_ptr]]\n"
+ "movi v17.4s, #0x7f\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "movi v17.4s, #0x7f\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqdmulh v12.4s, v12.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
@@ -416,10 +416,10 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"15:" // Oddments: 2 inputs loop
"ldp x21, x20, [x22, #0x0]\n"
"add x22, x22, #0x10\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
+ "add x21, x21, x27\n"
+ "add x20, x20, x27\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -493,8 +493,8 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x21, [x22], #0x8\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
+ "add x21, x21, x27\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -549,18 +549,18 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
- "ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v17.4s\n"
- "sqdmulh v14.4s, v14.4s, v17.4s\n"
- "sqdmulh v13.4s, v13.4s, v17.4s\n"
- "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v19.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v18.4s }, [%x[shift_ptr]]\n"
"movi v17.4s, #0x7f\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqdmulh v12.4s, v12.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 7e62ac1afc..eef399efbc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,11 +65,11 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
__asm__ __volatile__(
"ldr x16, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
- "cmp x16, #0x10\n"
"mov x15, #0x0\n"
+ "mov x14, #0x0\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "mov x12, #0x0\n"
+ "cmp x16, #0x10\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ldp x11, x10, [x21, #0x10]\n"
"ldp x9, x28, [x20, #0x0]\n"
"ldp x27, x26, [x20, #0x10]\n"
@@ -80,14 +80,14 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q30, [x28, x15]\n"
"ldr q29, [x25, x15]\n"
"lsr x20, x16, #0x4\n"
- "sub x16, x16, x20, LSL #4\n"
"ldr q28, [x22, x15]\n"
"ldr q27, [x26, x15]\n"
- "subs x20, x20, #0x1\n"
"ldr q26, [x9, x15]\n"
"ldr q25, [x27, x15]\n"
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "subs x20, x20, #0x1\n"
"ldr q22, [x21, x15]\n"
"add x15, x15, #0x10\n"
"beq 2f\n"
@@ -107,62 +107,62 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
"subs x20, x20, #0x1\n"
- "smax v19.16b, v21.16b, v19.16b\n"
"ldr q22, [x21, x15]\n"
+ "smax v19.16b, v21.16b, v19.16b\n"
"smax v18.16b, v18.16b, v21.16b\n"
- "smax v17.16b, v17.16b, v20.16b\n"
"add x15, x15, #0x10\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x14, x12]\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"smax v21.16b, v30.16b, v29.16b\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "smax v16.16b, v27.16b, v26.16b\n"
+ "smax v19.16b, v27.16b, v26.16b\n"
"smax v18.16b, v25.16b, v24.16b\n"
"smax v17.16b, v27.16b, v23.16b\n"
- "smax v19.16b, v24.16b, v22.16b\n"
- "smax v16.16b, v21.16b, v16.16b\n"
+ "smax v16.16b, v24.16b, v22.16b\n"
+ "smax v19.16b, v21.16b, v19.16b\n"
"smax v18.16b, v18.16b, v21.16b\n"
- "str q16, [x14, x12]\n"
"smax v17.16b, v17.16b, v20.16b\n"
- "smax v16.16b, v20.16b, v19.16b\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "smax v16.16b, v20.16b, v16.16b\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
"ldr b16, [x28, x15]\n"
- "ldr b17, [x25, x15]\n"
- "smax v23.16b, v16.16b, v17.16b\n"
+ "ldr b24, [x25, x15]\n"
"subs x16, x16, #0x1\n"
- "ldr b16, [x22, x15]\n"
- "ldr b22, [x26, x15]\n"
- "smax v21.16b, v17.16b, v16.16b\n"
- "ldr b16, [x9, x15]\n"
- "ldr b17, [x27, x15]\n"
- "smax v16.16b, v22.16b, v16.16b\n"
- "smax v20.16b, v23.16b, v16.16b\n"
- "ldr b19, [x24, x15]\n"
- "ldr b16, [x23, x15]\n"
- "smax v18.16b, v17.16b, v19.16b\n"
- "smax v17.16b, v22.16b, v16.16b\n"
+ "ldr b20, [x22, x15]\n"
+ "ldr b23, [x26, x15]\n"
+ "ldr b19, [x9, x15]\n"
+ "ldr b18, [x27, x15]\n"
+ "ldr b22, [x24, x15]\n"
+ "ldr b17, [x23, x15]\n"
+ "smax v21.16b, v16.16b, v24.16b\n"
"ldr b16, [x21, x15]\n"
- "smax v16.16b, v19.16b, v16.16b\n"
+ "smax v20.16b, v24.16b, v20.16b\n"
"add x15, x15, #0x1\n"
- "smax v18.16b, v18.16b, v23.16b\n"
- "smax v17.16b, v17.16b, v21.16b\n"
- "smax v16.16b, v21.16b, v16.16b\n"
- "str b20, [x14, x12]\n"
- "str b18, [x13, x12]\n"
- "str b17, [x11, x12]\n"
- "str b16, [x10, x12]\n"
- "add x12, x12, #0x1\n"
+ "smax v19.16b, v23.16b, v19.16b\n"
+ "smax v18.16b, v18.16b, v22.16b\n"
+ "smax v17.16b, v23.16b, v17.16b\n"
+ "smax v16.16b, v22.16b, v16.16b\n"
+ "smax v19.16b, v21.16b, v19.16b\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v16.16b\n"
+ "str b19, [x13, x14]\n"
+ "str b18, [x12, x14]\n"
+ "str b17, [x11, x14]\n"
+ "str b16, [x10, x14]\n"
+ "add x14, x14, #0x1\n"
"bgt 3b\n"
"4:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
index 411fd11460..334d85bfb5 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,121 +41,121 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v9.16b, #0x80\n"
"movi v8.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
"movi v7.16b, #0x80\n"
- "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
- "movi v5.16b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "smax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x21, x26]\n"
- "smax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x20, x26]\n"
- "smax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x21, x24]\n"
- "smax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x20, x24]\n"
- "smax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x21, x23]\n"
+ "smax v23.16b, v5.16b, v4.16b\n"
+ "smax v19.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax v22.16b, v1.16b, v0.16b\n"
+ "smax v18.16b, v31.16b, v30.16b\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "smax v21.16b, v29.16b, v21.16b\n"
+ "smax v17.16b, v28.16b, v27.16b\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "smax v20.16b, v26.16b, v20.16b\n"
"smax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"smax v17.16b, v21.16b, v17.16b\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "smax v7.16b, v7.16b, v18.16b\n"
- "smax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "smax v5.16b, v5.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "smax v9.16b, v9.16b, v19.16b\n"
+ "smax v8.16b, v8.16b, v18.16b\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "smax v22.16b, v2.16b, v1.16b\n"
- "smax v18.16b, v27.16b, v21.16b\n"
- "smax v21.16b, v0.16b, v31.16b\n"
- "smax v17.16b, v26.16b, v20.16b\n"
- "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v23.16b, v5.16b, v4.16b\n"
+ "smax v19.16b, v3.16b, v2.16b\n"
+ "smax v22.16b, v1.16b, v0.16b\n"
+ "smax v18.16b, v31.16b, v30.16b\n"
+ "smax v21.16b, v29.16b, v21.16b\n"
+ "smax v17.16b, v28.16b, v27.16b\n"
+ "smax v20.16b, v26.16b, v20.16b\n"
"smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v18.16b\n"
- "smax v6.16b, v6.16b, v17.16b\n"
- "smax v5.16b, v5.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v19.16b\n"
+ "smax v8.16b, v8.16b, v18.16b\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v16.16b\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "smax v9.16b, v9.16b, v19.16b\n"
+ "smax v8.16b, v8.16b, v18.16b\n"
"smax v7.16b, v7.16b, v17.16b\n"
"smax v6.16b, v6.16b, v16.16b\n"
- "ldr q16, [x20, x23]\n"
- "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x40\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "str q8, [%x[outptr], x27]\n"
- "str q7, [%x[outptr], x26]\n"
+ "str q8, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q7, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
+ "str q6, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q6, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q5, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -163,272 +163,272 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "movi v8.16b, #0x80\n"
- "mov x22, %x[inptrs]\n"
+ "movi v9.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v17.16b, v4.16b, v3.16b\n"
- "smax v16.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "smax v16.16b, v17.16b, v16.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "smax v17.16b, v5.16b, v4.16b\n"
+ "smax v16.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "smax v8.16b, v8.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v17.16b, v4.16b, v3.16b\n"
- "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v5.16b, v4.16b\n"
+ "smax v16.16b, v3.16b, v2.16b\n"
"smax v16.16b, v17.16b, v16.16b\n"
- "smax v8.16b, v8.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q16, [x20, x9]\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x27\n"
- "movi v8.16b, #0x80\n"
+ "add %x[outptr], %x[outptr], x9\n"
+ "movi v9.16b, #0x80\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
- "ld1 { v3.h }[6], [x22], #0x2\n"
- "ld1 { v28.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
+ "ld1 { v3.h }[6], [x21], #0x2\n"
+ "ld1 { v2.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
- "ld1 { v3.b }[14], [x22], #0x1\n"
- "ld1 { v28.b }[14], [x21], #0x1\n"
- "ld1 { v22.b }[14], [x20], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x22], #0x1\n"
+ "ld1 { v3.b }[14], [x21], #0x1\n"
+ "ld1 { v2.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
- "ld1 { v3.b }[12], [x22], #0x1\n"
- "ld1 { v28.b }[12], [x21], #0x1\n"
- "ld1 { v22.b }[12], [x20], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x22], #0x1\n"
+ "ld1 { v3.b }[12], [x21], #0x1\n"
+ "ld1 { v2.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
- "ld1 { v3.h }[4], [x22], #0x2\n"
- "ld1 { v28.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
+ "ld1 { v3.h }[4], [x21], #0x2\n"
+ "ld1 { v2.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
- "ld1 { v3.b }[10], [x22], #0x1\n"
- "ld1 { v28.b }[10], [x21], #0x1\n"
- "ld1 { v22.b }[10], [x20], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x22], #0x1\n"
+ "ld1 { v3.b }[10], [x21], #0x1\n"
+ "ld1 { v2.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
- "ld1 { v3.b }[8], [x22], #0x1\n"
- "ld1 { v28.b }[8], [x21], #0x1\n"
- "ld1 { v22.b }[8], [x20], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x22], #0x1\n"
+ "ld1 { v3.b }[8], [x21], #0x1\n"
+ "ld1 { v2.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
- "ld1 { v3.b }[6], [x22], #0x1\n"
- "ld1 { v28.b }[6], [x21], #0x1\n"
- "ld1 { v22.b }[6], [x20], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x22], #0x1\n"
+ "ld1 { v3.b }[6], [x21], #0x1\n"
+ "ld1 { v2.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
- "ld1 { v3.b }[4], [x22], #0x1\n"
- "ld1 { v28.b }[4], [x21], #0x1\n"
- "ld1 { v22.b }[4], [x20], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x22], #0x1\n"
+ "ld1 { v3.b }[4], [x21], #0x1\n"
+ "ld1 { v2.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h2, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
- "ld1 { v3.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
- "ld1 { v22.b }[2], [x20], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x22], #0x1\n"
+ "ld1 { v3.b }[2], [x21], #0x1\n"
+ "ld1 { v2.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x23], #0x1\n"
- "ldr b3, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
- "ldr b22, [x20], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
+ "ldr b4, [x22], #0x1\n"
+ "ldr b3, [x21], #0x1\n"
+ "ldr b2, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v17.16b, v4.16b, v3.16b\n"
- "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v5.16b, v4.16b\n"
+ "smax v16.16b, v3.16b, v2.16b\n"
"subs x25, x25, #0x1\n"
"smax v16.16b, v17.16b, v16.16b\n"
- "smax v8.16b, v8.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x23], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
+ "smax v9.16b, v9.16b, v5.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v9.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 019f402911..60135a42d5 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -141,9 +141,9 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"ldr q29, [x21, x26]\n"
"ldr q28, [x20, x26]\n"
@@ -156,26 +156,26 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddl v23.8h, v31.8b, v30.8b\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
- "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
"saddl v21.8h, v29.8b, v28.8b\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x21, x26]\n"
- "ldr q28, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
"saddl v19.8h, v27.8b, v26.8b\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x21, x25]\n"
- "ldr q26, [x20, x25]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x21, x24]\n"
- "ldr q24, [x20, x24]\n"
- "subs x23, x23, #0x1\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
- "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -219,17 +219,17 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
- "sxtl v23.8h, v16.8b\n"
- "sxtl2 v22.8h, v16.16b\n"
- "ldr q16, [x20, x26]\n"
+ "subs x23, x23, #0x1\n"
+ "ldr q19, [x20, x27]\n"
+ "ldr q18, [x20, x26]\n"
"ldr q17, [x20, x25]\n"
- "sxtl v21.8h, v16.8b\n"
- "sxtl2 v20.8h, v16.16b\n"
"ldr q16, [x20, x24]\n"
+ "sxtl v23.8h, v19.8b\n"
+ "sxtl2 v22.8h, v19.16b\n"
+ "sxtl v21.8h, v18.8b\n"
+ "sxtl2 v20.8h, v18.16b\n"
"sxtl v19.8h, v17.8b\n"
"sxtl2 v18.8h, v17.16b\n"
- "subs x23, x23, #0x1\n"
"sxtl v17.8h, v16.8b\n"
"sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
@@ -250,61 +250,61 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
+ "ld1r { v20.4s }, [%x[left_shift]]\n"
+ "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
+ "movi v18.4s, #0x7f\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
+ "ld1r { v17.4s }, [%x[right_shift]]\n"
"cmp %x[n_channels], #0x40\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v18.4s\n"
- "srshl v7.4s, v7.4s, v18.4s\n"
- "srshl v6.4s, v6.4s, v18.4s\n"
- "srshl v5.4s, v5.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v18.4s\n"
- "srshl v3.4s, v3.4s, v18.4s\n"
- "srshl v2.4s, v2.4s, v18.4s\n"
- "srshl v1.4s, v1.4s, v18.4s\n"
- "srshl v0.4s, v0.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v17.4s\n"
- "sqrdmulh v14.4s, v14.4s, v17.4s\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "sqrdmulh v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v17.4s\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v17.4s\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "sqrdmulh v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v17.4s\n"
- "sqrdmulh v1.4s, v1.4s, v17.4s\n"
- "sqrdmulh v0.4s, v0.4s, v17.4s\n"
- "movi v17.4s, #0x7f\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v16.4s\n"
- "srshl v8.4s, v8.4s, v16.4s\n"
- "srshl v7.4s, v7.4s, v16.4s\n"
- "srshl v6.4s, v6.4s, v16.4s\n"
- "srshl v5.4s, v5.4s, v16.4s\n"
- "srshl v4.4s, v4.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v16.4s\n"
- "srshl v2.4s, v2.4s, v16.4s\n"
- "srshl v1.4s, v1.4s, v16.4s\n"
- "srshl v0.4s, v0.4s, v16.4s\n"
- "not v16.16b, v17.16b\n"
+ "not v16.16b, v18.16b\n"
+ "srshl v15.4s, v15.4s, v20.4s\n"
+ "srshl v14.4s, v14.4s, v20.4s\n"
+ "srshl v13.4s, v13.4s, v20.4s\n"
+ "srshl v12.4s, v12.4s, v20.4s\n"
+ "srshl v11.4s, v11.4s, v20.4s\n"
+ "srshl v10.4s, v10.4s, v20.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "srshl v8.4s, v8.4s, v20.4s\n"
+ "srshl v7.4s, v7.4s, v20.4s\n"
+ "srshl v6.4s, v6.4s, v20.4s\n"
+ "srshl v5.4s, v5.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "srshl v3.4s, v3.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v20.4s\n"
+ "srshl v1.4s, v1.4s, v20.4s\n"
+ "srshl v0.4s, v0.4s, v20.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v19.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v19.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v19.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v19.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v19.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v19.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v19.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v19.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v19.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v19.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v19.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v17.4s\n"
+ "srshl v14.4s, v14.4s, v17.4s\n"
+ "srshl v13.4s, v13.4s, v17.4s\n"
+ "srshl v12.4s, v12.4s, v17.4s\n"
+ "srshl v11.4s, v11.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v8.4s, v8.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "srshl v6.4s, v6.4s, v17.4s\n"
+ "srshl v5.4s, v5.4s, v17.4s\n"
+ "srshl v4.4s, v4.4s, v17.4s\n"
+ "srshl v3.4s, v3.4s, v17.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "srshl v1.4s, v1.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v17.4s\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
@@ -321,36 +321,36 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "smin v11.4s, v11.4s, v17.4s\n"
- "smin v10.4s, v10.4s, v17.4s\n"
- "smin v9.4s, v9.4s, v17.4s\n"
- "smin v8.4s, v8.4s, v17.4s\n"
- "smin v7.4s, v7.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v17.4s\n"
- "smin v5.4s, v5.4s, v17.4s\n"
- "smin v4.4s, v4.4s, v17.4s\n"
- "smin v3.4s, v3.4s, v17.4s\n"
- "smin v2.4s, v2.4s, v17.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
+ "smin v15.4s, v15.4s, v18.4s\n"
+ "smin v14.4s, v14.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v18.4s\n"
+ "smin v12.4s, v12.4s, v18.4s\n"
+ "smin v11.4s, v11.4s, v18.4s\n"
+ "smin v10.4s, v10.4s, v18.4s\n"
+ "smin v9.4s, v9.4s, v18.4s\n"
+ "smin v8.4s, v8.4s, v18.4s\n"
+ "smin v7.4s, v7.4s, v18.4s\n"
+ "smin v6.4s, v6.4s, v18.4s\n"
+ "smin v5.4s, v5.4s, v18.4s\n"
+ "smin v4.4s, v4.4s, v18.4s\n"
+ "smin v3.4s, v3.4s, v18.4s\n"
+ "smin v2.4s, v2.4s, v18.4s\n"
+ "smin v1.4s, v1.4s, v18.4s\n"
+ "smin v0.4s, v0.4s, v18.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
- "uzp1 v16.16b, v13.16b, v12.16b\n"
+ "uzp1 v19.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
"uzp1 v18.16b, v9.16b, v8.16b\n"
"uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
"uzp1 v20.16b, v3.16b, v2.16b\n"
- "uzp1 v19.16b, v1.16b, v0.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v1.16b, v0.16b\n"
+ "uzp1 v19.16b, v23.16b, v19.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
- "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "uzp1 v16.16b, v20.16b, v16.16b\n"
+ "str q19, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
"str q17, [%x[outptr], x25]\n"
@@ -371,23 +371,23 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
"saddl v17.8h, v31.8b, v30.8b\n"
"saddl2 v16.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
"ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
- "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v17.4h\n"
"saddw2 v14.4s, v14.4s, v17.8h\n"
"saddw v13.4s, v13.4s, v16.4h\n"
"saddw2 v12.4s, v12.4s, v16.8h\n"
- "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
"saddl v17.8h, v31.8b, v30.8b\n"
@@ -401,43 +401,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x23, x23, #0x1\n"
"ldr q16, [x20, x27]\n"
"sxtl v17.8h, v16.8b\n"
"sxtl2 v16.8h, v16.16b\n"
- "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v17.4h\n"
"saddw2 v14.4s, v14.4s, v17.8h\n"
"saddw v13.4s, v13.4s, v16.4h\n"
"saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
+ "ld1r { v20.4s }, [%x[left_shift]]\n"
+ "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
+ "movi v18.4s, #0x7f\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sqrdmulh v15.4s, v15.4s, v17.4s\n"
- "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "ld1r { v17.4s }, [%x[right_shift]]\n"
"cmp %x[n_channels], #0x10\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "sqrdmulh v12.4s, v12.4s, v17.4s\n"
- "movi v17.4s, #0x7f\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "not v16.16b, v17.16b\n"
+ "not v16.16b, v18.16b\n"
+ "srshl v15.4s, v15.4s, v20.4s\n"
+ "srshl v14.4s, v14.4s, v20.4s\n"
+ "srshl v13.4s, v13.4s, v20.4s\n"
+ "srshl v12.4s, v12.4s, v20.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v17.4s\n"
+ "srshl v14.4s, v14.4s, v17.4s\n"
+ "srshl v13.4s, v13.4s, v17.4s\n"
+ "srshl v12.4s, v12.4s, v17.4s\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v15.4s, v15.4s, v18.4s\n"
+ "smin v14.4s, v14.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v18.4s\n"
+ "smin v12.4s, v12.4s, v18.4s\n"
"uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v16.16b, v17.16b, v16.16b\n"
@@ -457,10 +457,10 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"15:" // Oddments: 2 inputs loop
"ldp x21, x20, [x22, #0x0]\n"
"add x22, x22, #0x10\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
+ "add x21, x21, x27\n"
+ "add x20, x20, x27\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -534,8 +534,8 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x21, [x22], #0x8\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
+ "add x21, x21, x27\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -590,31 +590,31 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v17.4s\n"
- "sqrdmulh v14.4s, v14.4s, v17.4s\n"
- "sqrdmulh v13.4s, v13.4s, v17.4s\n"
- "sqrdmulh v12.4s, v12.4s, v17.4s\n"
- "movi v17.4s, #0x7f\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "not v16.16b, v17.16b\n"
+ "ld1r { v20.4s }, [%x[left_shift]]\n"
+ "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
+ "movi v18.4s, #0x7f\n"
+ "ld1r { v17.4s }, [%x[right_shift]]\n"
+ "not v16.16b, v18.16b\n"
+ "srshl v15.4s, v15.4s, v20.4s\n"
+ "srshl v14.4s, v14.4s, v20.4s\n"
+ "srshl v13.4s, v13.4s, v20.4s\n"
+ "srshl v12.4s, v12.4s, v20.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v17.4s\n"
+ "srshl v14.4s, v14.4s, v17.4s\n"
+ "srshl v13.4s, v13.4s, v17.4s\n"
+ "srshl v12.4s, v12.4s, v17.4s\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v15.4s, v15.4s, v18.4s\n"
+ "smin v14.4s, v14.4s, v18.4s\n"
+ "smin v13.4s, v13.4s, v18.4s\n"
+ "smin v12.4s, v12.4s, v18.4s\n"
"uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v16.16b, v17.16b, v16.16b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
index f7b8dc761c..797a8f9235 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,245 +43,245 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v9.16b, #0x80\n"
"movi v8.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
"movi v7.16b, #0x80\n"
- "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
- "movi v5.16b, #0x80\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "smax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x21, x26]\n"
- "smax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x20, x26]\n"
- "smax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x21, x24]\n"
- "smax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x20, x24]\n"
- "smax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x21, x23]\n"
+ "smax v23.16b, v5.16b, v4.16b\n"
+ "smax v19.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax v22.16b, v1.16b, v0.16b\n"
+ "smax v18.16b, v31.16b, v30.16b\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "smax v21.16b, v29.16b, v21.16b\n"
+ "smax v17.16b, v28.16b, v27.16b\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "smax v20.16b, v26.16b, v20.16b\n"
"smax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"smax v17.16b, v21.16b, v17.16b\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "smax v7.16b, v7.16b, v18.16b\n"
- "smax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "smax v5.16b, v5.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "smax v9.16b, v9.16b, v19.16b\n"
+ "smax v8.16b, v8.16b, v18.16b\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v4.16b, v3.16b\n"
- "smax v19.16b, v28.16b, v22.16b\n"
- "smax v22.16b, v2.16b, v1.16b\n"
- "smax v18.16b, v27.16b, v21.16b\n"
- "smax v21.16b, v0.16b, v31.16b\n"
- "smax v17.16b, v26.16b, v20.16b\n"
- "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v23.16b, v5.16b, v4.16b\n"
+ "smax v19.16b, v3.16b, v2.16b\n"
+ "smax v22.16b, v1.16b, v0.16b\n"
+ "smax v18.16b, v31.16b, v30.16b\n"
+ "smax v21.16b, v29.16b, v21.16b\n"
+ "smax v17.16b, v28.16b, v27.16b\n"
+ "smax v20.16b, v26.16b, v20.16b\n"
"smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v18.16b\n"
- "smax v6.16b, v6.16b, v17.16b\n"
- "smax v5.16b, v5.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v19.16b\n"
+ "smax v8.16b, v8.16b, v18.16b\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v16.16b\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "smax v9.16b, v9.16b, v19.16b\n"
+ "smax v8.16b, v8.16b, v18.16b\n"
"smax v7.16b, v7.16b, v17.16b\n"
"smax v6.16b, v6.16b, v16.16b\n"
- "ldr q16, [x20, x23]\n"
- "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v4.4s }, [x20]\n"
- "sxtl v21.8h, v7.8b\n"
- "sxtl2 v18.8h, v7.16b\n"
+ "sxtl v23.8h, v9.8b\n"
+ "sxtl2 v19.8h, v9.16b\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v4.4s }, [x21]\n"
"ld1r { v3.4s }, [x20]\n"
- "sxtl v20.8h, v6.8b\n"
- "sxtl2 v19.8h, v6.16b\n"
+ "sxtl v22.8h, v8.8b\n"
+ "sxtl2 v18.8h, v8.16b\n"
+ "sxtl v21.8h, v7.8b\n"
+ "sxtl2 v20.8h, v7.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v2.4s }, [x20]\n"
- "sxtl v17.8h, v5.8b\n"
- "sxtl2 v16.8h, v5.16b\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "sxtl v17.8h, v6.8b\n"
+ "sxtl2 v16.8h, v6.16b\n"
"cmp %x[n_channels], #0x40\n"
"sxtl v1.4s, v23.4h\n"
"sxtl2 v23.4s, v23.8h\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
- "sxtl v30.4s, v21.4h\n"
- "sxtl2 v22.4s, v21.8h\n"
- "sxtl v29.4s, v18.4h\n"
+ "sxtl v0.4s, v19.4h\n"
+ "sxtl2 v19.4s, v19.8h\n"
+ "sxtl v31.4s, v22.4h\n"
+ "sxtl2 v22.4s, v22.8h\n"
+ "sxtl v30.4s, v18.4h\n"
"sxtl2 v18.4s, v18.8h\n"
+ "sxtl v29.4s, v21.4h\n"
+ "sxtl2 v21.4s, v21.8h\n"
"sxtl v28.4s, v20.4h\n"
- "sxtl2 v21.4s, v20.8h\n"
- "sxtl v27.4s, v19.4h\n"
- "sxtl2 v26.4s, v19.8h\n"
- "sxtl v25.4s, v17.4h\n"
+ "sxtl2 v27.4s, v20.8h\n"
+ "sxtl v26.4s, v17.4h\n"
"sxtl2 v20.4s, v17.8h\n"
- "sxtl v24.4s, v16.4h\n"
- "sxtl2 v19.4s, v16.8h\n"
+ "sxtl v25.4s, v16.4h\n"
+ "sxtl2 v24.4s, v16.8h\n"
"srshl v1.4s, v1.4s, v4.4s\n"
"srshl v23.4s, v23.4s, v4.4s\n"
"srshl v0.4s, v0.4s, v4.4s\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
"srshl v31.4s, v31.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v4.4s\n"
"srshl v22.4s, v22.4s, v4.4s\n"
- "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v4.4s\n"
- "srshl v28.4s, v28.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
"srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
"srshl v27.4s, v27.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v4.4s\n"
- "srshl v25.4s, v25.4s, v4.4s\n"
"srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
"srshl v24.4s, v24.4s, v4.4s\n"
- "srshl v19.4s, v19.4s, v4.4s\n"
"sqrdmulh v1.4s, v1.4s, v3.4s\n"
"sqrdmulh v23.4s, v23.4s, v3.4s\n"
"sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v3.4s\n"
"sqrdmulh v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v30.4s, v30.4s, v3.4s\n"
"sqrdmulh v22.4s, v22.4s, v3.4s\n"
- "sqrdmulh v29.4s, v29.4s, v3.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
"sqrdmulh v18.4s, v18.4s, v3.4s\n"
- "sqrdmulh v28.4s, v28.4s, v3.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v3.4s\n"
"sqrdmulh v21.4s, v21.4s, v3.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v3.4s\n"
"sqrdmulh v27.4s, v27.4s, v3.4s\n"
"sqrdmulh v26.4s, v26.4s, v3.4s\n"
- "sqrdmulh v25.4s, v25.4s, v3.4s\n"
"sqrdmulh v20.4s, v20.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
"sqrdmulh v24.4s, v24.4s, v3.4s\n"
- "sqrdmulh v19.4s, v19.4s, v3.4s\n"
"movi v17.4s, #0x7f\n"
"srshl v1.4s, v1.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v2.4s\n"
"srshl v0.4s, v0.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v2.4s\n"
- "srshl v30.4s, v30.4s, v2.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v29.4s, v29.4s, v2.4s\n"
+ "srshl v30.4s, v30.4s, v2.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v2.4s\n"
+ "srshl v29.4s, v29.4s, v2.4s\n"
"srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v28.4s, v28.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v2.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
- "srshl v25.4s, v25.4s, v2.4s\n"
"srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v25.4s, v25.4s, v2.4s\n"
"srshl v24.4s, v24.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
"not v16.16b, v17.16b\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v23.4s, v23.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
"smax v31.4s, v31.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
"smax v22.4s, v22.4s, v16.4s\n"
- "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
"smax v18.4s, v18.4s, v16.4s\n"
- "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
"smax v21.4s, v21.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
"smax v27.4s, v27.4s, v16.4s\n"
"smax v26.4s, v26.4s, v16.4s\n"
- "smax v25.4s, v25.4s, v16.4s\n"
"smax v20.4s, v20.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
"smax v24.4s, v24.4s, v16.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
"smin v1.4s, v1.4s, v17.4s\n"
"smin v23.4s, v23.4s, v17.4s\n"
"smin v0.4s, v0.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
"smin v31.4s, v31.4s, v17.4s\n"
- "smin v30.4s, v30.4s, v17.4s\n"
"smin v22.4s, v22.4s, v17.4s\n"
- "smin v29.4s, v29.4s, v17.4s\n"
+ "smin v30.4s, v30.4s, v17.4s\n"
"smin v18.4s, v18.4s, v17.4s\n"
- "smin v28.4s, v28.4s, v17.4s\n"
+ "smin v29.4s, v29.4s, v17.4s\n"
"smin v21.4s, v21.4s, v17.4s\n"
+ "smin v28.4s, v28.4s, v17.4s\n"
"smin v27.4s, v27.4s, v17.4s\n"
"smin v26.4s, v26.4s, v17.4s\n"
- "smin v25.4s, v25.4s, v17.4s\n"
"smin v20.4s, v20.4s, v17.4s\n"
+ "smin v25.4s, v25.4s, v17.4s\n"
"smin v24.4s, v24.4s, v17.4s\n"
- "smin v19.4s, v19.4s, v17.4s\n"
"uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v22.16b, v30.16b, v22.16b\n"
- "uzp1 v18.16b, v29.16b, v18.16b\n"
- "uzp1 v21.16b, v28.16b, v21.16b\n"
- "uzp1 v17.16b, v27.16b, v26.16b\n"
- "uzp1 v20.16b, v25.16b, v20.16b\n"
- "uzp1 v19.16b, v24.16b, v19.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v19.16b, v0.16b, v19.16b\n"
+ "uzp1 v22.16b, v31.16b, v22.16b\n"
+ "uzp1 v18.16b, v30.16b, v18.16b\n"
+ "uzp1 v21.16b, v29.16b, v21.16b\n"
+ "uzp1 v17.16b, v28.16b, v27.16b\n"
+ "uzp1 v20.16b, v26.16b, v20.16b\n"
+ "uzp1 v16.16b, v25.16b, v24.16b\n"
+ "uzp1 v19.16b, v23.16b, v19.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
- "uzp1 v16.16b, v20.16b, v19.16b\n"
- "str q18, [%x[outptr], x26]\n"
+ "uzp1 v16.16b, v20.16b, v16.16b\n"
+ "str q19, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
+ "str q18, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q17, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q16, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -289,295 +289,295 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "movi v8.16b, #0x80\n"
- "mov x22, %x[inptrs]\n"
+ "movi v9.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v17.16b, v4.16b, v3.16b\n"
- "smax v16.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "smax v16.16b, v17.16b, v16.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "smax v17.16b, v5.16b, v4.16b\n"
+ "smax v16.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "smax v8.16b, v8.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v17.16b, v4.16b, v3.16b\n"
- "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v5.16b, v4.16b\n"
+ "smax v16.16b, v3.16b, v2.16b\n"
"smax v16.16b, v17.16b, v16.16b\n"
- "smax v8.16b, v8.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q16, [x20, x9]\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "sxtl v17.8h, v8.8b\n"
- "sxtl2 v16.8h, v8.16b\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v22.4s }, [x20]\n"
- "sxtl v21.4s, v17.4h\n"
- "sxtl2 v20.4s, v17.8h\n"
+ "sxtl v17.8h, v9.8b\n"
+ "sxtl2 v16.8h, v9.16b\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v17.4s }, [x20]\n"
- "sxtl v19.4s, v16.4h\n"
- "sxtl2 v18.4s, v16.8h\n"
+ "ld1r { v24.4s }, [x21]\n"
+ "ld1r { v23.4s }, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v16.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "srshl v20.4s, v20.4s, v22.4s\n"
+ "movi v22.4s, #0x7f\n"
+ "ld1r { v21.4s }, [x20]\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sxtl v20.4s, v17.4h\n"
+ "sxtl2 v17.4s, v17.8h\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "srshl v18.4s, v18.4s, v22.4s\n"
- "sqrdmulh v21.4s, v21.4s, v17.4s\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "sqrdmulh v19.4s, v19.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "movi v17.4s, #0x7f\n"
- "srshl v21.4s, v21.4s, v16.4s\n"
- "srshl v20.4s, v20.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v16.4s\n"
- "srshl v18.4s, v18.4s, v16.4s\n"
- "not v16.16b, v17.16b\n"
- "smax v21.4s, v21.4s, v16.4s\n"
+ "not v16.16b, v22.16b\n"
+ "srshl v20.4s, v20.4s, v24.4s\n"
+ "srshl v17.4s, v17.4s, v24.4s\n"
+ "srshl v19.4s, v19.4s, v24.4s\n"
+ "srshl v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v17.4s, v17.4s, v21.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
"smax v20.4s, v20.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
"smax v19.4s, v19.4s, v16.4s\n"
"smax v18.4s, v18.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v17.4s\n"
- "smin v20.4s, v20.4s, v17.4s\n"
- "smin v19.4s, v19.4s, v17.4s\n"
- "smin v18.4s, v18.4s, v17.4s\n"
- "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "smin v20.4s, v20.4s, v22.4s\n"
+ "smin v17.4s, v17.4s, v22.4s\n"
+ "smin v19.4s, v19.4s, v22.4s\n"
+ "smin v18.4s, v18.4s, v22.4s\n"
+ "uzp1 v17.16b, v20.16b, v17.16b\n"
"uzp1 v16.16b, v19.16b, v18.16b\n"
"uzp1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
+ "str q16, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x27\n"
- "movi v8.16b, #0x80\n"
+ "add %x[outptr], %x[outptr], x9\n"
+ "movi v9.16b, #0x80\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
- "ld1 { v3.h }[6], [x22], #0x2\n"
- "ld1 { v28.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
+ "ld1 { v3.h }[6], [x21], #0x2\n"
+ "ld1 { v2.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
- "ld1 { v3.b }[14], [x22], #0x1\n"
- "ld1 { v28.b }[14], [x21], #0x1\n"
- "ld1 { v22.b }[14], [x20], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x22], #0x1\n"
+ "ld1 { v3.b }[14], [x21], #0x1\n"
+ "ld1 { v2.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
- "ld1 { v3.b }[12], [x22], #0x1\n"
- "ld1 { v28.b }[12], [x21], #0x1\n"
- "ld1 { v22.b }[12], [x20], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x22], #0x1\n"
+ "ld1 { v3.b }[12], [x21], #0x1\n"
+ "ld1 { v2.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
- "ld1 { v3.h }[4], [x22], #0x2\n"
- "ld1 { v28.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
+ "ld1 { v3.h }[4], [x21], #0x2\n"
+ "ld1 { v2.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
- "ld1 { v3.b }[10], [x22], #0x1\n"
- "ld1 { v28.b }[10], [x21], #0x1\n"
- "ld1 { v22.b }[10], [x20], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x22], #0x1\n"
+ "ld1 { v3.b }[10], [x21], #0x1\n"
+ "ld1 { v2.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
- "ld1 { v3.b }[8], [x22], #0x1\n"
- "ld1 { v28.b }[8], [x21], #0x1\n"
- "ld1 { v22.b }[8], [x20], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x22], #0x1\n"
+ "ld1 { v3.b }[8], [x21], #0x1\n"
+ "ld1 { v2.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
- "ld1 { v3.b }[6], [x22], #0x1\n"
- "ld1 { v28.b }[6], [x21], #0x1\n"
- "ld1 { v22.b }[6], [x20], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x22], #0x1\n"
+ "ld1 { v3.b }[6], [x21], #0x1\n"
+ "ld1 { v2.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
- "ld1 { v3.b }[4], [x22], #0x1\n"
- "ld1 { v28.b }[4], [x21], #0x1\n"
- "ld1 { v22.b }[4], [x20], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x22], #0x1\n"
+ "ld1 { v3.b }[4], [x21], #0x1\n"
+ "ld1 { v2.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h2, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
- "ld1 { v3.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
- "ld1 { v22.b }[2], [x20], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x22], #0x1\n"
+ "ld1 { v3.b }[2], [x21], #0x1\n"
+ "ld1 { v2.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x23], #0x1\n"
- "ldr b3, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
- "ldr b22, [x20], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
+ "ldr b4, [x22], #0x1\n"
+ "ldr b3, [x21], #0x1\n"
+ "ldr b2, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v17.16b, v4.16b, v3.16b\n"
- "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v17.16b, v5.16b, v4.16b\n"
+ "smax v16.16b, v3.16b, v2.16b\n"
"subs x25, x25, #0x1\n"
"smax v16.16b, v17.16b, v16.16b\n"
- "smax v8.16b, v8.16b, v16.16b\n"
+ "smax v9.16b, v9.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x23], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
- "smax v8.16b, v8.16b, v4.16b\n"
+ "smax v9.16b, v9.16b, v5.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "sxtl v17.8h, v8.8b\n"
- "sxtl2 v16.8h, v8.16b\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v22.4s }, [x20]\n"
- "sxtl v21.4s, v17.4h\n"
- "sxtl2 v20.4s, v17.8h\n"
+ "sxtl v17.8h, v9.8b\n"
+ "sxtl2 v16.8h, v9.16b\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v24.4s }, [x21]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "movi v22.4s, #0x7f\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "sxtl v20.4s, v17.4h\n"
+ "sxtl2 v17.4s, v17.8h\n"
"sxtl v19.4s, v16.4h\n"
"sxtl2 v18.4s, v16.8h\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v16.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v22.4s\n"
- "srshl v20.4s, v20.4s, v22.4s\n"
- "srshl v19.4s, v19.4s, v22.4s\n"
- "srshl v18.4s, v18.4s, v22.4s\n"
- "sqrdmulh v21.4s, v21.4s, v17.4s\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "sqrdmulh v19.4s, v19.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v17.4s\n"
- "movi v17.4s, #0x7f\n"
- "srshl v21.4s, v21.4s, v16.4s\n"
- "srshl v20.4s, v20.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v16.4s\n"
- "srshl v18.4s, v18.4s, v16.4s\n"
- "not v16.16b, v17.16b\n"
- "smax v21.4s, v21.4s, v16.4s\n"
+ "not v16.16b, v22.16b\n"
+ "srshl v20.4s, v20.4s, v24.4s\n"
+ "srshl v17.4s, v17.4s, v24.4s\n"
+ "srshl v19.4s, v19.4s, v24.4s\n"
+ "srshl v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v23.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v23.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v23.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v21.4s\n"
+ "srshl v17.4s, v17.4s, v21.4s\n"
+ "srshl v19.4s, v19.4s, v21.4s\n"
+ "srshl v18.4s, v18.4s, v21.4s\n"
"smax v20.4s, v20.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
"smax v19.4s, v19.4s, v16.4s\n"
"smax v18.4s, v18.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v17.4s\n"
- "smin v20.4s, v20.4s, v17.4s\n"
- "smin v19.4s, v19.4s, v17.4s\n"
- "smin v18.4s, v18.4s, v17.4s\n"
- "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "smin v20.4s, v20.4s, v22.4s\n"
+ "smin v17.4s, v17.4s, v22.4s\n"
+ "smin v19.4s, v19.4s, v22.4s\n"
+ "smin v18.4s, v18.4s, v22.4s\n"
+ "uzp1 v17.16b, v20.16b, v17.16b\n"
"uzp1 v16.16b, v19.16b, v18.16b\n"
"uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
@@ -628,7 +628,7 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index f8984c451c..dbbf4ae2b3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -122,9 +122,9 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v0.4s, #0x0\n"
"cbz x23, 4f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"ldr q29, [x21, x26]\n"
"ldr q28, [x20, x26]\n"
@@ -137,26 +137,26 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
- "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x21, x26]\n"
- "ldr q28, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x21, x25]\n"
- "ldr q26, [x20, x25]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x21, x24]\n"
- "ldr q24, [x20, x24]\n"
- "subs x23, x23, #0x1\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -200,17 +200,17 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
- "uxtl v23.8h, v16.8b\n"
- "uxtl2 v22.8h, v16.16b\n"
- "ldr q16, [x20, x26]\n"
+ "subs x23, x23, #0x1\n"
+ "ldr q19, [x20, x27]\n"
+ "ldr q18, [x20, x26]\n"
"ldr q17, [x20, x25]\n"
- "uxtl v21.8h, v16.8b\n"
- "uxtl2 v20.8h, v16.16b\n"
"ldr q16, [x20, x24]\n"
+ "uxtl v23.8h, v19.8b\n"
+ "uxtl2 v22.8h, v19.16b\n"
+ "uxtl v21.8h, v18.8b\n"
+ "uxtl2 v20.8h, v18.16b\n"
"uxtl v19.8h, v17.8b\n"
"uxtl2 v18.8h, v17.16b\n"
- "subs x23, x23, #0x1\n"
"uxtl v17.8h, v16.8b\n"
"uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
@@ -231,60 +231,60 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
- "ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v17.4s\n"
- "sqdmulh v14.4s, v14.4s, v17.4s\n"
- "sqdmulh v13.4s, v13.4s, v17.4s\n"
- "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v19.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v18.4s }, [%x[shift_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "movi v16.4s, #0xff\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v11.4s, v11.4s, v17.4s\n"
- "sqdmulh v10.4s, v10.4s, v17.4s\n"
- "sqdmulh v9.4s, v9.4s, v17.4s\n"
- "sqdmulh v8.4s, v8.4s, v17.4s\n"
- "sqdmulh v7.4s, v7.4s, v17.4s\n"
- "sqdmulh v6.4s, v6.4s, v17.4s\n"
- "sqdmulh v5.4s, v5.4s, v17.4s\n"
- "sqdmulh v4.4s, v4.4s, v17.4s\n"
- "sqdmulh v3.4s, v3.4s, v17.4s\n"
- "sqdmulh v2.4s, v2.4s, v17.4s\n"
- "sqdmulh v1.4s, v1.4s, v17.4s\n"
- "sqdmulh v0.4s, v0.4s, v17.4s\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "srshl v11.4s, v11.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v16.4s\n"
- "srshl v8.4s, v8.4s, v16.4s\n"
- "srshl v7.4s, v7.4s, v16.4s\n"
- "srshl v6.4s, v6.4s, v16.4s\n"
- "srshl v5.4s, v5.4s, v16.4s\n"
- "srshl v4.4s, v4.4s, v16.4s\n"
- "srshl v3.4s, v3.4s, v16.4s\n"
- "srshl v2.4s, v2.4s, v16.4s\n"
- "srshl v1.4s, v1.4s, v16.4s\n"
- "srshl v0.4s, v0.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smax v14.4s, v14.4s, v16.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v16.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "smax v7.4s, v7.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v16.4s\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "smax v4.4s, v4.4s, v16.4s\n"
- "smax v3.4s, v3.4s, v16.4s\n"
- "smax v2.4s, v2.4s, v16.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqdmulh v12.4s, v12.4s, v19.4s\n"
+ "sqdmulh v11.4s, v11.4s, v19.4s\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "sqdmulh v8.4s, v8.4s, v19.4s\n"
+ "sqdmulh v7.4s, v7.4s, v19.4s\n"
+ "sqdmulh v6.4s, v6.4s, v19.4s\n"
+ "sqdmulh v5.4s, v5.4s, v19.4s\n"
+ "sqdmulh v4.4s, v4.4s, v19.4s\n"
+ "sqdmulh v3.4s, v3.4s, v19.4s\n"
+ "sqdmulh v2.4s, v2.4s, v19.4s\n"
+ "sqdmulh v1.4s, v1.4s, v19.4s\n"
+ "sqdmulh v0.4s, v0.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "srshl v11.4s, v11.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "srshl v8.4s, v8.4s, v18.4s\n"
+ "srshl v7.4s, v7.4s, v18.4s\n"
+ "srshl v6.4s, v6.4s, v18.4s\n"
+ "srshl v5.4s, v5.4s, v18.4s\n"
+ "srshl v4.4s, v4.4s, v18.4s\n"
+ "srshl v3.4s, v3.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v18.4s\n"
+ "srshl v0.4s, v0.4s, v18.4s\n"
+ "smax v15.4s, v15.4s, v17.4s\n"
+ "smax v14.4s, v14.4s, v17.4s\n"
+ "smax v13.4s, v13.4s, v17.4s\n"
+ "smax v12.4s, v12.4s, v17.4s\n"
+ "smax v11.4s, v11.4s, v17.4s\n"
+ "smax v10.4s, v10.4s, v17.4s\n"
+ "smax v9.4s, v9.4s, v17.4s\n"
+ "smax v8.4s, v8.4s, v17.4s\n"
+ "smax v7.4s, v7.4s, v17.4s\n"
+ "smax v6.4s, v6.4s, v17.4s\n"
+ "smax v5.4s, v5.4s, v17.4s\n"
+ "smax v4.4s, v4.4s, v17.4s\n"
+ "smax v3.4s, v3.4s, v17.4s\n"
+ "smax v2.4s, v2.4s, v17.4s\n"
+ "smax v1.4s, v1.4s, v17.4s\n"
+ "smax v0.4s, v0.4s, v17.4s\n"
"smin v15.4s, v15.4s, v16.4s\n"
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
@@ -302,19 +302,19 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"smin v1.4s, v1.4s, v16.4s\n"
"smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
- "uzp1 v16.16b, v13.16b, v12.16b\n"
+ "uzp1 v19.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
"uzp1 v18.16b, v9.16b, v8.16b\n"
"uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
"uzp1 v20.16b, v3.16b, v2.16b\n"
- "uzp1 v19.16b, v1.16b, v0.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v1.16b, v0.16b\n"
+ "uzp1 v19.16b, v23.16b, v19.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
- "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "uzp1 v16.16b, v20.16b, v16.16b\n"
+ "str q19, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
"str q17, [%x[outptr], x25]\n"
@@ -335,23 +335,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v12.4s, #0x0\n"
"cbz x23, 11f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
"uaddl v17.8h, v31.8b, v30.8b\n"
"uaddl2 v16.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
"ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
- "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v17.4h\n"
"uaddw2 v14.4s, v14.4s, v17.8h\n"
"uaddw v13.4s, v13.4s, v16.4h\n"
"uaddw2 v12.4s, v12.4s, v16.8h\n"
- "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
"uaddl v17.8h, v31.8b, v30.8b\n"
@@ -365,34 +365,34 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x23, x23, #0x1\n"
"ldr q16, [x20, x27]\n"
"uxtl v17.8h, v16.8b\n"
"uxtl2 v16.8h, v16.16b\n"
- "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v17.4h\n"
"uaddw2 v14.4s, v14.4s, v17.8h\n"
"uaddw v13.4s, v13.4s, v16.4h\n"
"uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
- "ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v17.4s\n"
- "sqdmulh v14.4s, v14.4s, v17.4s\n"
- "sqdmulh v13.4s, v13.4s, v17.4s\n"
- "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v19.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v18.4s }, [%x[shift_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "movi v16.4s, #0xff\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smax v14.4s, v14.4s, v16.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqdmulh v12.4s, v12.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "smax v15.4s, v15.4s, v17.4s\n"
+ "smax v14.4s, v14.4s, v17.4s\n"
+ "smax v13.4s, v13.4s, v17.4s\n"
+ "smax v12.4s, v12.4s, v17.4s\n"
"smin v15.4s, v15.4s, v16.4s\n"
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
@@ -416,10 +416,10 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"15:" // Oddments: 2 inputs loop
"ldp x21, x20, [x22, #0x0]\n"
"add x22, x22, #0x10\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
+ "add x21, x21, x27\n"
+ "add x20, x20, x27\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -493,8 +493,8 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x21, [x22], #0x8\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
+ "add x21, x21, x27\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -549,22 +549,22 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
- "ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v17.4s\n"
- "sqdmulh v14.4s, v14.4s, v17.4s\n"
- "sqdmulh v13.4s, v13.4s, v17.4s\n"
- "sqdmulh v12.4s, v12.4s, v17.4s\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smax v14.4s, v14.4s, v16.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v16.4s\n"
+ "ld1r { v19.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v18.4s }, [%x[shift_ptr]]\n"
+ "movi v17.4s, #0x0\n"
"movi v16.4s, #0xff\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v14.4s, v14.4s, v19.4s\n"
+ "sqdmulh v13.4s, v13.4s, v19.4s\n"
+ "sqdmulh v12.4s, v12.4s, v19.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "smax v15.4s, v15.4s, v17.4s\n"
+ "smax v14.4s, v14.4s, v17.4s\n"
+ "smax v13.4s, v13.4s, v17.4s\n"
+ "smax v12.4s, v12.4s, v17.4s\n"
"smin v15.4s, v15.4s, v16.4s\n"
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 66cdb7f849..d12733c7de 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,11 +65,11 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
__asm__ __volatile__(
"ldr x16, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
- "cmp x16, #0x10\n"
"mov x15, #0x0\n"
+ "mov x14, #0x0\n"
"ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "mov x12, #0x0\n"
+ "cmp x16, #0x10\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ldp x11, x10, [x21, #0x10]\n"
"ldp x9, x28, [x20, #0x0]\n"
"ldp x27, x26, [x20, #0x10]\n"
@@ -80,14 +80,14 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q30, [x28, x15]\n"
"ldr q29, [x25, x15]\n"
"lsr x20, x16, #0x4\n"
- "sub x16, x16, x20, LSL #4\n"
"ldr q28, [x22, x15]\n"
"ldr q27, [x26, x15]\n"
- "subs x20, x20, #0x1\n"
"ldr q26, [x9, x15]\n"
"ldr q25, [x27, x15]\n"
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "subs x20, x20, #0x1\n"
"ldr q22, [x21, x15]\n"
"add x15, x15, #0x10\n"
"beq 2f\n"
@@ -107,62 +107,62 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr q24, [x24, x15]\n"
"ldr q23, [x23, x15]\n"
"subs x20, x20, #0x1\n"
- "umax v19.16b, v21.16b, v19.16b\n"
"ldr q22, [x21, x15]\n"
+ "umax v19.16b, v21.16b, v19.16b\n"
"umax v18.16b, v18.16b, v21.16b\n"
- "umax v17.16b, v17.16b, v20.16b\n"
"add x15, x15, #0x10\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x14, x12]\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"umax v21.16b, v30.16b, v29.16b\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "umax v16.16b, v27.16b, v26.16b\n"
+ "umax v19.16b, v27.16b, v26.16b\n"
"umax v18.16b, v25.16b, v24.16b\n"
"umax v17.16b, v27.16b, v23.16b\n"
- "umax v19.16b, v24.16b, v22.16b\n"
- "umax v16.16b, v21.16b, v16.16b\n"
+ "umax v16.16b, v24.16b, v22.16b\n"
+ "umax v19.16b, v21.16b, v19.16b\n"
"umax v18.16b, v18.16b, v21.16b\n"
- "str q16, [x14, x12]\n"
"umax v17.16b, v17.16b, v20.16b\n"
- "umax v16.16b, v20.16b, v19.16b\n"
- "str q18, [x13, x12]\n"
- "str q17, [x11, x12]\n"
- "str q16, [x10, x12]\n"
- "add x12, x12, #0x10\n"
+ "umax v16.16b, v20.16b, v16.16b\n"
+ "str q19, [x13, x14]\n"
+ "str q18, [x12, x14]\n"
+ "str q17, [x11, x14]\n"
+ "str q16, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
"cbz x16, 4f\n"
"3:" // Oddments
"ldr b16, [x28, x15]\n"
- "ldr b17, [x25, x15]\n"
- "umax v23.16b, v16.16b, v17.16b\n"
+ "ldr b24, [x25, x15]\n"
"subs x16, x16, #0x1\n"
- "ldr b16, [x22, x15]\n"
- "ldr b22, [x26, x15]\n"
- "umax v21.16b, v17.16b, v16.16b\n"
- "ldr b16, [x9, x15]\n"
- "ldr b17, [x27, x15]\n"
- "umax v16.16b, v22.16b, v16.16b\n"
- "umax v20.16b, v23.16b, v16.16b\n"
- "ldr b19, [x24, x15]\n"
- "ldr b16, [x23, x15]\n"
- "umax v18.16b, v17.16b, v19.16b\n"
- "umax v17.16b, v22.16b, v16.16b\n"
+ "ldr b20, [x22, x15]\n"
+ "ldr b23, [x26, x15]\n"
+ "ldr b19, [x9, x15]\n"
+ "ldr b18, [x27, x15]\n"
+ "ldr b22, [x24, x15]\n"
+ "ldr b17, [x23, x15]\n"
+ "umax v21.16b, v16.16b, v24.16b\n"
"ldr b16, [x21, x15]\n"
- "umax v16.16b, v19.16b, v16.16b\n"
+ "umax v20.16b, v24.16b, v20.16b\n"
"add x15, x15, #0x1\n"
- "umax v18.16b, v18.16b, v23.16b\n"
- "umax v17.16b, v17.16b, v21.16b\n"
- "umax v16.16b, v21.16b, v16.16b\n"
- "str b20, [x14, x12]\n"
- "str b18, [x13, x12]\n"
- "str b17, [x11, x12]\n"
- "str b16, [x10, x12]\n"
- "add x12, x12, #0x1\n"
+ "umax v19.16b, v23.16b, v19.16b\n"
+ "umax v18.16b, v18.16b, v22.16b\n"
+ "umax v17.16b, v23.16b, v17.16b\n"
+ "umax v16.16b, v22.16b, v16.16b\n"
+ "umax v19.16b, v21.16b, v19.16b\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v16.16b\n"
+ "str b19, [x13, x14]\n"
+ "str b18, [x12, x14]\n"
+ "str b17, [x11, x14]\n"
+ "str b16, [x10, x14]\n"
+ "add x14, x14, #0x1\n"
"bgt 3b\n"
"4:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
index 2ceef125ca..bf6335b71a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,121 +41,121 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v9.16b, #0x0\n"
"movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"movi v7.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "umax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x21, x26]\n"
- "umax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x20, x26]\n"
- "umax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x21, x24]\n"
- "umax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x20, x24]\n"
- "umax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x21, x23]\n"
+ "umax v23.16b, v5.16b, v4.16b\n"
+ "umax v19.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax v22.16b, v1.16b, v0.16b\n"
+ "umax v18.16b, v31.16b, v30.16b\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "umax v21.16b, v29.16b, v21.16b\n"
+ "umax v17.16b, v28.16b, v27.16b\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "umax v20.16b, v26.16b, v20.16b\n"
"umax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"umax v17.16b, v21.16b, v17.16b\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "umax v7.16b, v7.16b, v18.16b\n"
- "umax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "umax v5.16b, v5.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "umax v9.16b, v9.16b, v19.16b\n"
+ "umax v8.16b, v8.16b, v18.16b\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "umax v22.16b, v2.16b, v1.16b\n"
- "umax v18.16b, v27.16b, v21.16b\n"
- "umax v21.16b, v0.16b, v31.16b\n"
- "umax v17.16b, v26.16b, v20.16b\n"
- "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v23.16b, v5.16b, v4.16b\n"
+ "umax v19.16b, v3.16b, v2.16b\n"
+ "umax v22.16b, v1.16b, v0.16b\n"
+ "umax v18.16b, v31.16b, v30.16b\n"
+ "umax v21.16b, v29.16b, v21.16b\n"
+ "umax v17.16b, v28.16b, v27.16b\n"
+ "umax v20.16b, v26.16b, v20.16b\n"
"umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v18.16b\n"
- "umax v6.16b, v6.16b, v17.16b\n"
- "umax v5.16b, v5.16b, v16.16b\n"
+ "umax v9.16b, v9.16b, v19.16b\n"
+ "umax v8.16b, v8.16b, v18.16b\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v16.16b\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "umax v9.16b, v9.16b, v19.16b\n"
+ "umax v8.16b, v8.16b, v18.16b\n"
"umax v7.16b, v7.16b, v17.16b\n"
"umax v6.16b, v6.16b, v16.16b\n"
- "ldr q16, [x20, x23]\n"
- "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x40\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "str q8, [%x[outptr], x27]\n"
- "str q7, [%x[outptr], x26]\n"
+ "str q8, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q7, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
+ "str q6, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q6, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q5, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -163,272 +163,272 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "movi v8.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
+ "movi v9.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v17.16b, v4.16b, v3.16b\n"
- "umax v16.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "umax v16.16b, v17.16b, v16.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "umax v17.16b, v5.16b, v4.16b\n"
+ "umax v16.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "umax v8.16b, v8.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v9.16b, v9.16b, v16.16b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v17.16b, v4.16b, v3.16b\n"
- "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v5.16b, v4.16b\n"
+ "umax v16.16b, v3.16b, v2.16b\n"
"umax v16.16b, v17.16b, v16.16b\n"
- "umax v8.16b, v8.16b, v16.16b\n"
+ "umax v9.16b, v9.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q16, [x20, x9]\n"
+ "umax v9.16b, v9.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"sub %x[n_channels], %x[n_channels], #0x10\n"
+ "str q9, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "str q8, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x27\n"
- "movi v8.16b, #0x0\n"
+ "add %x[outptr], %x[outptr], x9\n"
+ "movi v9.16b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
- "ld1 { v3.h }[6], [x22], #0x2\n"
- "ld1 { v28.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
+ "ld1 { v3.h }[6], [x21], #0x2\n"
+ "ld1 { v2.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
- "ld1 { v3.b }[14], [x22], #0x1\n"
- "ld1 { v28.b }[14], [x21], #0x1\n"
- "ld1 { v22.b }[14], [x20], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x22], #0x1\n"
+ "ld1 { v3.b }[14], [x21], #0x1\n"
+ "ld1 { v2.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
- "ld1 { v3.b }[12], [x22], #0x1\n"
- "ld1 { v28.b }[12], [x21], #0x1\n"
- "ld1 { v22.b }[12], [x20], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x22], #0x1\n"
+ "ld1 { v3.b }[12], [x21], #0x1\n"
+ "ld1 { v2.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
- "ld1 { v3.h }[4], [x22], #0x2\n"
- "ld1 { v28.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
+ "ld1 { v3.h }[4], [x21], #0x2\n"
+ "ld1 { v2.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
- "ld1 { v3.b }[10], [x22], #0x1\n"
- "ld1 { v28.b }[10], [x21], #0x1\n"
- "ld1 { v22.b }[10], [x20], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x22], #0x1\n"
+ "ld1 { v3.b }[10], [x21], #0x1\n"
+ "ld1 { v2.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
- "ld1 { v3.b }[8], [x22], #0x1\n"
- "ld1 { v28.b }[8], [x21], #0x1\n"
- "ld1 { v22.b }[8], [x20], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x22], #0x1\n"
+ "ld1 { v3.b }[8], [x21], #0x1\n"
+ "ld1 { v2.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
- "ld1 { v3.b }[6], [x22], #0x1\n"
- "ld1 { v28.b }[6], [x21], #0x1\n"
- "ld1 { v22.b }[6], [x20], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x22], #0x1\n"
+ "ld1 { v3.b }[6], [x21], #0x1\n"
+ "ld1 { v2.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
- "ld1 { v3.b }[4], [x22], #0x1\n"
- "ld1 { v28.b }[4], [x21], #0x1\n"
- "ld1 { v22.b }[4], [x20], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x22], #0x1\n"
+ "ld1 { v3.b }[4], [x21], #0x1\n"
+ "ld1 { v2.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h2, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
- "ld1 { v3.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
- "ld1 { v22.b }[2], [x20], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x22], #0x1\n"
+ "ld1 { v3.b }[2], [x21], #0x1\n"
+ "ld1 { v2.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x23], #0x1\n"
- "ldr b3, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
- "ldr b22, [x20], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
+ "ldr b4, [x22], #0x1\n"
+ "ldr b3, [x21], #0x1\n"
+ "ldr b2, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v17.16b, v4.16b, v3.16b\n"
- "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v5.16b, v4.16b\n"
+ "umax v16.16b, v3.16b, v2.16b\n"
"subs x25, x25, #0x1\n"
"umax v16.16b, v17.16b, v16.16b\n"
- "umax v8.16b, v8.16b, v16.16b\n"
+ "umax v9.16b, v9.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x23], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
+ "umax v9.16b, v9.16b, v5.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v9.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v9.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v9.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v9.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 31a3489e5c..0734e9b128 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -128,11 +128,11 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"1:" // 4-vectors of channels
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
"mov v11.16b, v15.16b\n"
- "mov x22, %x[inptrs]\n"
"mov v10.16b, v15.16b\n"
"mov v9.16b, v15.16b\n"
"mov v8.16b, v15.16b\n"
@@ -146,9 +146,9 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v0.16b, v15.16b\n"
"cbz x23, 4f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"ldr q29, [x21, x26]\n"
"ldr q28, [x20, x26]\n"
@@ -161,26 +161,26 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddl v23.8h, v31.8b, v30.8b\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
- "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q29, [x21, x26]\n"
- "ldr q28, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q27, [x21, x25]\n"
- "ldr q26, [x20, x25]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q25, [x21, x24]\n"
- "ldr q24, [x20, x24]\n"
- "subs x23, x23, #0x1\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
- "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -224,17 +224,17 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
- "uxtl v23.8h, v16.8b\n"
- "uxtl2 v22.8h, v16.16b\n"
- "ldr q16, [x20, x26]\n"
+ "subs x23, x23, #0x1\n"
+ "ldr q19, [x20, x27]\n"
+ "ldr q18, [x20, x26]\n"
"ldr q17, [x20, x25]\n"
- "uxtl v21.8h, v16.8b\n"
- "uxtl2 v20.8h, v16.16b\n"
"ldr q16, [x20, x24]\n"
+ "uxtl v23.8h, v19.8b\n"
+ "uxtl2 v22.8h, v19.16b\n"
+ "uxtl v21.8h, v18.8b\n"
+ "uxtl2 v20.8h, v18.16b\n"
"uxtl v19.8h, v17.8b\n"
"uxtl2 v18.8h, v17.16b\n"
- "subs x23, x23, #0x1\n"
"uxtl v17.8h, v16.8b\n"
"uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
@@ -255,95 +255,95 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1r { v19.4s }, [%x[left_shift]]\n"
- "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v14.4s, v14.4s, v19.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v19.4s\n"
- "srshl v12.4s, v12.4s, v19.4s\n"
+ "ld1r { v21.4s }, [%x[left_shift]]\n"
+ "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
- "srshl v11.4s, v11.4s, v19.4s\n"
- "srshl v10.4s, v10.4s, v19.4s\n"
+ "movi v19.4s, #0x0\n"
+ "ld1r { v18.4s }, [%x[right_shift]]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "movi v16.4s, #0xff\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "srshl v9.4s, v9.4s, v19.4s\n"
- "srshl v8.4s, v8.4s, v19.4s\n"
"cmp %x[n_channels], #0x40\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "srshl v6.4s, v6.4s, v19.4s\n"
- "srshl v5.4s, v5.4s, v19.4s\n"
- "srshl v4.4s, v4.4s, v19.4s\n"
- "srshl v3.4s, v3.4s, v19.4s\n"
- "srshl v2.4s, v2.4s, v19.4s\n"
- "srshl v1.4s, v1.4s, v19.4s\n"
- "srshl v0.4s, v0.4s, v19.4s\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "sqrdmulh v14.4s, v14.4s, v18.4s\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "sqrdmulh v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v10.4s, v10.4s, v18.4s\n"
- "sqrdmulh v9.4s, v9.4s, v18.4s\n"
- "sqrdmulh v8.4s, v8.4s, v18.4s\n"
- "sqrdmulh v7.4s, v7.4s, v18.4s\n"
- "sqrdmulh v6.4s, v6.4s, v18.4s\n"
- "sqrdmulh v5.4s, v5.4s, v18.4s\n"
- "sqrdmulh v4.4s, v4.4s, v18.4s\n"
- "sqrdmulh v3.4s, v3.4s, v18.4s\n"
- "sqrdmulh v2.4s, v2.4s, v18.4s\n"
- "sqrdmulh v1.4s, v1.4s, v18.4s\n"
- "sqrdmulh v0.4s, v0.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "add v15.4s, v15.4s, v16.4s\n"
- "add v14.4s, v14.4s, v16.4s\n"
- "add v13.4s, v13.4s, v16.4s\n"
- "add v12.4s, v12.4s, v16.4s\n"
- "add v11.4s, v11.4s, v16.4s\n"
- "add v10.4s, v10.4s, v16.4s\n"
- "add v9.4s, v9.4s, v16.4s\n"
- "add v8.4s, v8.4s, v16.4s\n"
- "add v7.4s, v7.4s, v16.4s\n"
- "add v6.4s, v6.4s, v16.4s\n"
- "add v5.4s, v5.4s, v16.4s\n"
- "add v4.4s, v4.4s, v16.4s\n"
- "add v3.4s, v3.4s, v16.4s\n"
- "add v2.4s, v2.4s, v16.4s\n"
- "add v1.4s, v1.4s, v16.4s\n"
- "add v0.4s, v0.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smax v14.4s, v14.4s, v16.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v16.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "smax v7.4s, v7.4s, v16.4s\n"
- "smax v6.4s, v6.4s, v16.4s\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "smax v4.4s, v4.4s, v16.4s\n"
- "smax v3.4s, v3.4s, v16.4s\n"
- "smax v2.4s, v2.4s, v16.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "srshl v14.4s, v14.4s, v21.4s\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v12.4s, v12.4s, v21.4s\n"
+ "srshl v11.4s, v11.4s, v21.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "srshl v9.4s, v9.4s, v21.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "srshl v7.4s, v7.4s, v21.4s\n"
+ "srshl v6.4s, v6.4s, v21.4s\n"
+ "srshl v5.4s, v5.4s, v21.4s\n"
+ "srshl v4.4s, v4.4s, v21.4s\n"
+ "srshl v3.4s, v3.4s, v21.4s\n"
+ "srshl v2.4s, v2.4s, v21.4s\n"
+ "srshl v1.4s, v1.4s, v21.4s\n"
+ "srshl v0.4s, v0.4s, v21.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v20.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v20.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v20.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v20.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v20.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v20.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v20.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v20.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v20.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v20.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v20.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v20.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v20.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v20.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "srshl v11.4s, v11.4s, v18.4s\n"
+ "srshl v10.4s, v10.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v18.4s\n"
+ "srshl v8.4s, v8.4s, v18.4s\n"
+ "srshl v7.4s, v7.4s, v18.4s\n"
+ "srshl v6.4s, v6.4s, v18.4s\n"
+ "srshl v5.4s, v5.4s, v18.4s\n"
+ "srshl v4.4s, v4.4s, v18.4s\n"
+ "srshl v3.4s, v3.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v18.4s\n"
+ "srshl v1.4s, v1.4s, v18.4s\n"
+ "srshl v0.4s, v0.4s, v18.4s\n"
+ "add v15.4s, v15.4s, v17.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
+ "add v13.4s, v13.4s, v17.4s\n"
+ "add v12.4s, v12.4s, v17.4s\n"
+ "add v11.4s, v11.4s, v17.4s\n"
+ "add v10.4s, v10.4s, v17.4s\n"
+ "add v9.4s, v9.4s, v17.4s\n"
+ "add v8.4s, v8.4s, v17.4s\n"
+ "add v7.4s, v7.4s, v17.4s\n"
+ "add v6.4s, v6.4s, v17.4s\n"
+ "add v5.4s, v5.4s, v17.4s\n"
+ "add v4.4s, v4.4s, v17.4s\n"
+ "add v3.4s, v3.4s, v17.4s\n"
+ "add v2.4s, v2.4s, v17.4s\n"
+ "add v1.4s, v1.4s, v17.4s\n"
+ "add v0.4s, v0.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "smax v14.4s, v14.4s, v19.4s\n"
+ "smax v13.4s, v13.4s, v19.4s\n"
+ "smax v12.4s, v12.4s, v19.4s\n"
+ "smax v11.4s, v11.4s, v19.4s\n"
+ "smax v10.4s, v10.4s, v19.4s\n"
+ "smax v9.4s, v9.4s, v19.4s\n"
+ "smax v8.4s, v8.4s, v19.4s\n"
+ "smax v7.4s, v7.4s, v19.4s\n"
+ "smax v6.4s, v6.4s, v19.4s\n"
+ "smax v5.4s, v5.4s, v19.4s\n"
+ "smax v4.4s, v4.4s, v19.4s\n"
+ "smax v3.4s, v3.4s, v19.4s\n"
+ "smax v2.4s, v2.4s, v19.4s\n"
+ "smax v1.4s, v1.4s, v19.4s\n"
+ "smax v0.4s, v0.4s, v19.4s\n"
"smin v15.4s, v15.4s, v16.4s\n"
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
@@ -361,19 +361,19 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"smin v1.4s, v1.4s, v16.4s\n"
"smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
- "uzp1 v16.16b, v13.16b, v12.16b\n"
+ "uzp1 v19.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
"uzp1 v18.16b, v9.16b, v8.16b\n"
"uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
"uzp1 v20.16b, v3.16b, v2.16b\n"
- "uzp1 v19.16b, v1.16b, v0.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v1.16b, v0.16b\n"
+ "uzp1 v19.16b, v23.16b, v19.16b\n"
"uzp1 v18.16b, v22.16b, v18.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
"uzp1 v17.16b, v21.16b, v17.16b\n"
- "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "uzp1 v16.16b, v20.16b, v16.16b\n"
+ "str q19, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
"str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
"str q17, [%x[outptr], x25]\n"
@@ -388,29 +388,29 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
- "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
"ldp x21, x20, [x22, #0x0]\n"
- "ldr q31, [x21, x27]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
+ "ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
"uaddl v17.8h, v31.8b, v30.8b\n"
"uaddl2 v16.8h, v31.16b, v30.16b\n"
"ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
"ldr q31, [x21, x27]\n"
"ldr q30, [x20, x27]\n"
- "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v17.4h\n"
"uaddw2 v14.4s, v14.4s, v17.8h\n"
"uaddw v13.4s, v13.4s, v16.4h\n"
"uaddw2 v12.4s, v12.4s, v16.8h\n"
- "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
"uaddl v17.8h, v31.8b, v30.8b\n"
@@ -424,45 +424,45 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x23, x23, #0x1\n"
"ldr q16, [x20, x27]\n"
"uxtl v17.8h, v16.8b\n"
"uxtl2 v16.8h, v16.16b\n"
- "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v17.4h\n"
"uaddw2 v14.4s, v14.4s, v17.8h\n"
"uaddw v13.4s, v13.4s, v16.4h\n"
"uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1r { v16.4s }, [%x[left_shift]]\n"
- "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
+ "ld1r { v21.4s }, [%x[left_shift]]\n"
+ "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "movi v19.4s, #0x0\n"
+ "ld1r { v18.4s }, [%x[right_shift]]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "movi v16.4s, #0xff\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "add v15.4s, v15.4s, v16.4s\n"
- "add v14.4s, v14.4s, v16.4s\n"
- "add v13.4s, v13.4s, v16.4s\n"
- "add v12.4s, v12.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smax v14.4s, v14.4s, v16.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "srshl v14.4s, v14.4s, v21.4s\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v12.4s, v12.4s, v21.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v20.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v20.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v20.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "add v15.4s, v15.4s, v17.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
+ "add v13.4s, v13.4s, v17.4s\n"
+ "add v12.4s, v12.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "smax v14.4s, v14.4s, v19.4s\n"
+ "smax v13.4s, v13.4s, v19.4s\n"
+ "smax v12.4s, v12.4s, v19.4s\n"
"smin v15.4s, v15.4s, v16.4s\n"
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
@@ -478,18 +478,18 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
"add %x[outptr], %x[outptr], x27\n"
+ "mov x22, %x[inptrs]\n"
"mov v14.16b, v15.16b\n"
"mov v13.16b, v15.16b\n"
"mov v12.16b, v15.16b\n"
- "mov x22, %x[inptrs]\n"
"cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
"ldp x21, x20, [x22, #0x0]\n"
"add x22, x22, #0x10\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
+ "add x21, x21, x27\n"
+ "add x20, x20, x27\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -563,8 +563,8 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x21, [x22], #0x8\n"
- "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
+ "add x21, x21, x27\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -619,33 +619,33 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "ld1r { v16.4s }, [%x[left_shift]]\n"
- "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
- "srshl v15.4s, v15.4s, v16.4s\n"
- "srshl v14.4s, v14.4s, v16.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v13.4s, v13.4s, v16.4s\n"
- "srshl v12.4s, v12.4s, v16.4s\n"
+ "ld1r { v21.4s }, [%x[left_shift]]\n"
+ "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
- "sqrdmulh v15.4s, v15.4s, v18.4s\n"
- "sqrdmulh v14.4s, v14.4s, v18.4s\n"
- "sqrdmulh v13.4s, v13.4s, v18.4s\n"
- "sqrdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "add v15.4s, v15.4s, v16.4s\n"
- "add v14.4s, v14.4s, v16.4s\n"
- "add v13.4s, v13.4s, v16.4s\n"
- "add v12.4s, v12.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smax v14.4s, v14.4s, v16.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v19.4s, #0x0\n"
+ "ld1r { v18.4s }, [%x[right_shift]]\n"
+ "ld1r { v17.4s }, [x20]\n"
"movi v16.4s, #0xff\n"
+ "srshl v15.4s, v15.4s, v21.4s\n"
+ "srshl v14.4s, v14.4s, v21.4s\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v12.4s, v12.4s, v21.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v20.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v20.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v20.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v15.4s, v15.4s, v18.4s\n"
+ "srshl v14.4s, v14.4s, v18.4s\n"
+ "srshl v13.4s, v13.4s, v18.4s\n"
+ "srshl v12.4s, v12.4s, v18.4s\n"
+ "add v15.4s, v15.4s, v17.4s\n"
+ "add v14.4s, v14.4s, v17.4s\n"
+ "add v13.4s, v13.4s, v17.4s\n"
+ "add v12.4s, v12.4s, v17.4s\n"
+ "smax v15.4s, v15.4s, v19.4s\n"
+ "smax v14.4s, v14.4s, v19.4s\n"
+ "smax v13.4s, v13.4s, v19.4s\n"
+ "smax v12.4s, v12.4s, v19.4s\n"
"smin v15.4s, v15.4s, v16.4s\n"
"smin v14.4s, v14.4s, v16.4s\n"
"smin v13.4s, v13.4s, v16.4s\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
index f4927c5536..11a8ad88ec 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,266 +43,266 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
{
__asm__ __volatile__(
"cmp %x[n_channels], #0x40\n"
- "mov x27, #0x0\n"
- "mov x26, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
+ "mov x9, #0x0\n"
+ "mov x28, #0x10\n" // cntb _, ALL, #1
+ "mov x27, #0x20\n" // cntb _, ALL, #2
+ "mov x26, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v6.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
- "movi v6.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
"cbz x25, 4f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldr q2, [x21, x26]\n"
- "ldr q1, [x20, x26]\n"
- "ldr q0, [x21, x24]\n"
- "ldr q31, [x20, x24]\n"
- "ldr q30, [x21, x23]\n"
- "ldr q29, [x20, x23]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "ldr q27, [x20, x27]\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "umax v22.16b, v2.16b, v1.16b\n"
- "ldr q2, [x21, x26]\n"
- "umax v18.16b, v27.16b, v21.16b\n"
- "ldr q1, [x20, x26]\n"
- "umax v21.16b, v0.16b, v31.16b\n"
- "ldr q0, [x21, x24]\n"
- "umax v17.16b, v26.16b, v20.16b\n"
- "ldr q31, [x20, x24]\n"
- "umax v20.16b, v30.16b, v29.16b\n"
- "ldr q30, [x21, x23]\n"
+ "umax v23.16b, v5.16b, v4.16b\n"
+ "umax v19.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax v22.16b, v1.16b, v0.16b\n"
+ "umax v18.16b, v31.16b, v30.16b\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "umax v21.16b, v29.16b, v21.16b\n"
+ "umax v17.16b, v28.16b, v27.16b\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "umax v20.16b, v26.16b, v20.16b\n"
"umax v16.16b, v25.16b, v24.16b\n"
- "ldr q29, [x20, x23]\n"
+ "ldr q1, [x23, x28]\n"
+ "ldr q0, [x22, x28]\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "ldr q31, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
"umax v17.16b, v21.16b, v17.16b\n"
+ "ldr q29, [x23, x27]\n"
+ "ldr q21, [x22, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x21, x26]\n"
- "ldr q21, [x20, x26]\n"
- "subs x25, x25, #0x1\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "ldr q26, [x21, x24]\n"
- "ldr q20, [x20, x24]\n"
- "umax v7.16b, v7.16b, v18.16b\n"
- "umax v6.16b, v6.16b, v17.16b\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
- "umax v5.16b, v5.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q27, [x20, x27]\n"
+ "umax v6.16b, v6.16b, v19.16b\n"
+ "umax v9.16b, v9.16b, v18.16b\n"
+ "ldr q26, [x23, x26]\n"
+ "ldr q20, [x22, x26]\n"
+ "umax v8.16b, v8.16b, v17.16b\n"
+ "ldr q25, [x21, x26]\n"
+ "ldr q24, [x20, x26]\n"
+ "umax v7.16b, v7.16b, v16.16b\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v4.16b, v3.16b\n"
- "umax v19.16b, v28.16b, v22.16b\n"
- "umax v22.16b, v2.16b, v1.16b\n"
- "umax v18.16b, v27.16b, v21.16b\n"
- "umax v21.16b, v0.16b, v31.16b\n"
- "umax v17.16b, v26.16b, v20.16b\n"
- "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v23.16b, v5.16b, v4.16b\n"
+ "umax v19.16b, v3.16b, v2.16b\n"
+ "umax v22.16b, v1.16b, v0.16b\n"
+ "umax v18.16b, v31.16b, v30.16b\n"
+ "umax v21.16b, v29.16b, v21.16b\n"
+ "umax v17.16b, v28.16b, v27.16b\n"
+ "umax v20.16b, v26.16b, v20.16b\n"
"umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v8.16b, v8.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v18.16b\n"
- "umax v6.16b, v6.16b, v17.16b\n"
- "umax v5.16b, v5.16b, v16.16b\n"
+ "umax v6.16b, v6.16b, v19.16b\n"
+ "umax v9.16b, v9.16b, v18.16b\n"
+ "umax v8.16b, v8.16b, v17.16b\n"
+ "umax v7.16b, v7.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v16.16b\n"
- "ldr q17, [x20, x26]\n"
- "ldr q16, [x20, x24]\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "umax v6.16b, v6.16b, v16.16b\n"
- "ldr q16, [x20, x23]\n"
- "umax v5.16b, v5.16b, v16.16b\n"
+ "ldr q19, [x20, x9]\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr q17, [x20, x27]\n"
+ "ldr q16, [x20, x26]\n"
+ "umax v6.16b, v6.16b, v19.16b\n"
+ "umax v9.16b, v9.16b, v18.16b\n"
+ "umax v8.16b, v8.16b, v17.16b\n"
+ "umax v7.16b, v7.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v4.4s }, [x20]\n"
- "uxtl v23.8h, v8.8b\n"
- "uxtl2 v24.8h, v8.16b\n"
- "uxtl v22.8h, v7.8b\n"
- "uxtl2 v21.8h, v7.16b\n"
+ "add x21, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "uxtl v23.8h, v6.8b\n"
+ "uxtl2 v19.8h, v6.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v3.4s }, [x20]\n"
- "uxtl v20.8h, v6.8b\n"
- "uxtl2 v17.8h, v6.16b\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v2.4s }, [x20]\n"
- "uxtl v19.8h, v5.8b\n"
- "uxtl2 v18.8h, v5.16b\n"
+ "ld1r { v6.4s }, [x21]\n"
+ "ld1r { v5.4s }, [x20]\n"
+ "uxtl v22.8h, v9.8b\n"
+ "uxtl2 v18.8h, v9.16b\n"
+ "uxtl v21.8h, v8.8b\n"
+ "uxtl2 v17.8h, v8.16b\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v1.4s }, [x20]\n"
- "neg v4.4s, v4.4s\n"
- "saddw v0.4s, v4.4s, v23.4h\n"
+ "ld1r { v4.4s }, [x21]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "uxtl v20.8h, v7.8b\n"
+ "uxtl2 v16.8h, v7.16b\n"
+ "neg v6.4s, v6.4s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
- "saddw2 v23.4s, v4.4s, v23.8h\n"
- "saddw v31.4s, v4.4s, v24.4h\n"
+ "movi v2.4s, #0x0\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "movi v0.4s, #0xff\n"
"cmp %x[n_channels], #0x40\n"
- "saddw2 v30.4s, v4.4s, v24.8h\n"
- "saddw v29.4s, v4.4s, v22.4h\n"
- "saddw2 v22.4s, v4.4s, v22.8h\n"
- "saddw v28.4s, v4.4s, v21.4h\n"
- "saddw2 v21.4s, v4.4s, v21.8h\n"
- "saddw v27.4s, v4.4s, v20.4h\n"
- "saddw2 v20.4s, v4.4s, v20.8h\n"
- "saddw v26.4s, v4.4s, v17.4h\n"
- "saddw2 v17.4s, v4.4s, v17.8h\n"
- "saddw v25.4s, v4.4s, v19.4h\n"
- "saddw2 v19.4s, v4.4s, v19.8h\n"
- "saddw v24.4s, v4.4s, v18.4h\n"
- "saddw2 v18.4s, v4.4s, v18.8h\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
+ "saddw v31.4s, v6.4s, v23.4h\n"
+ "saddw2 v23.4s, v6.4s, v23.8h\n"
+ "saddw v30.4s, v6.4s, v19.4h\n"
+ "saddw2 v19.4s, v6.4s, v19.8h\n"
+ "saddw v29.4s, v6.4s, v22.4h\n"
+ "saddw2 v22.4s, v6.4s, v22.8h\n"
+ "saddw v28.4s, v6.4s, v18.4h\n"
+ "saddw2 v18.4s, v6.4s, v18.8h\n"
+ "saddw v27.4s, v6.4s, v21.4h\n"
+ "saddw2 v21.4s, v6.4s, v21.8h\n"
+ "saddw v26.4s, v6.4s, v17.4h\n"
+ "saddw2 v17.4s, v6.4s, v17.8h\n"
+ "saddw v25.4s, v6.4s, v20.4h\n"
+ "saddw2 v20.4s, v6.4s, v20.8h\n"
+ "saddw v24.4s, v6.4s, v16.4h\n"
+ "saddw2 v16.4s, v6.4s, v16.8h\n"
+ "srshl v31.4s, v31.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v5.4s\n"
+ "srshl v30.4s, v30.4s, v5.4s\n"
+ "srshl v19.4s, v19.4s, v5.4s\n"
+ "srshl v29.4s, v29.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v5.4s\n"
+ "srshl v28.4s, v28.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v5.4s\n"
+ "srshl v27.4s, v27.4s, v5.4s\n"
+ "srshl v21.4s, v21.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v5.4s\n"
+ "srshl v17.4s, v17.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v5.4s\n"
+ "srshl v20.4s, v20.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v5.4s\n"
+ "srshl v16.4s, v16.4s, v5.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
"srshl v31.4s, v31.4s, v3.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
"srshl v30.4s, v30.4s, v3.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
"srshl v29.4s, v29.4s, v3.4s\n"
"srshl v22.4s, v22.4s, v3.4s\n"
"srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
+ "srshl v18.4s, v18.4s, v3.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
+ "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v26.4s, v26.4s, v3.4s\n"
"srshl v17.4s, v17.4s, v3.4s\n"
"srshl v25.4s, v25.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v20.4s, v20.4s, v3.4s\n"
"srshl v24.4s, v24.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "sqrdmulh v0.4s, v0.4s, v2.4s\n"
- "sqrdmulh v23.4s, v23.4s, v2.4s\n"
- "sqrdmulh v31.4s, v31.4s, v2.4s\n"
- "sqrdmulh v30.4s, v30.4s, v2.4s\n"
- "sqrdmulh v29.4s, v29.4s, v2.4s\n"
- "sqrdmulh v22.4s, v22.4s, v2.4s\n"
- "sqrdmulh v28.4s, v28.4s, v2.4s\n"
- "sqrdmulh v21.4s, v21.4s, v2.4s\n"
- "sqrdmulh v27.4s, v27.4s, v2.4s\n"
- "sqrdmulh v20.4s, v20.4s, v2.4s\n"
- "sqrdmulh v26.4s, v26.4s, v2.4s\n"
- "sqrdmulh v17.4s, v17.4s, v2.4s\n"
- "sqrdmulh v25.4s, v25.4s, v2.4s\n"
- "sqrdmulh v19.4s, v19.4s, v2.4s\n"
- "sqrdmulh v24.4s, v24.4s, v2.4s\n"
- "sqrdmulh v18.4s, v18.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v1.4s\n"
- "srshl v23.4s, v23.4s, v1.4s\n"
- "srshl v31.4s, v31.4s, v1.4s\n"
- "srshl v30.4s, v30.4s, v1.4s\n"
- "srshl v29.4s, v29.4s, v1.4s\n"
- "srshl v22.4s, v22.4s, v1.4s\n"
- "srshl v28.4s, v28.4s, v1.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
- "srshl v27.4s, v27.4s, v1.4s\n"
- "srshl v20.4s, v20.4s, v1.4s\n"
- "srshl v26.4s, v26.4s, v1.4s\n"
- "srshl v17.4s, v17.4s, v1.4s\n"
- "srshl v25.4s, v25.4s, v1.4s\n"
- "srshl v19.4s, v19.4s, v1.4s\n"
- "srshl v24.4s, v24.4s, v1.4s\n"
- "srshl v18.4s, v18.4s, v1.4s\n"
- "add v0.4s, v0.4s, v16.4s\n"
- "add v23.4s, v23.4s, v16.4s\n"
- "add v31.4s, v31.4s, v16.4s\n"
- "add v30.4s, v30.4s, v16.4s\n"
- "add v29.4s, v29.4s, v16.4s\n"
- "add v22.4s, v22.4s, v16.4s\n"
- "add v28.4s, v28.4s, v16.4s\n"
- "add v21.4s, v21.4s, v16.4s\n"
- "add v27.4s, v27.4s, v16.4s\n"
- "add v20.4s, v20.4s, v16.4s\n"
- "add v26.4s, v26.4s, v16.4s\n"
- "add v17.4s, v17.4s, v16.4s\n"
- "add v25.4s, v25.4s, v16.4s\n"
- "add v19.4s, v19.4s, v16.4s\n"
- "add v24.4s, v24.4s, v16.4s\n"
- "add v18.4s, v18.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
- "smax v29.4s, v29.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v16.4s\n"
- "smax v28.4s, v28.4s, v16.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "smax v27.4s, v27.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v26.4s, v26.4s, v16.4s\n"
- "smax v17.4s, v17.4s, v16.4s\n"
- "smax v25.4s, v25.4s, v16.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
- "smin v0.4s, v0.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v16.4s\n"
- "smin v31.4s, v31.4s, v16.4s\n"
- "smin v30.4s, v30.4s, v16.4s\n"
- "smin v29.4s, v29.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v16.4s\n"
- "smin v28.4s, v28.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v16.4s\n"
- "smin v27.4s, v27.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v16.4s\n"
- "smin v26.4s, v26.4s, v16.4s\n"
- "smin v17.4s, v17.4s, v16.4s\n"
- "smin v25.4s, v25.4s, v16.4s\n"
- "smin v19.4s, v19.4s, v16.4s\n"
- "smin v24.4s, v24.4s, v16.4s\n"
- "smin v18.4s, v18.4s, v16.4s\n"
- "uzp1 v23.16b, v0.16b, v23.16b\n"
- "uzp1 v16.16b, v31.16b, v30.16b\n"
+ "srshl v16.4s, v16.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v1.4s\n"
+ "add v19.4s, v19.4s, v1.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v1.4s\n"
+ "add v28.4s, v28.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v1.4s\n"
+ "add v27.4s, v27.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v1.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v24.4s, v24.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v1.4s\n"
+ "smax v31.4s, v31.4s, v2.4s\n"
+ "smax v23.4s, v23.4s, v2.4s\n"
+ "smax v30.4s, v30.4s, v2.4s\n"
+ "smax v19.4s, v19.4s, v2.4s\n"
+ "smax v29.4s, v29.4s, v2.4s\n"
+ "smax v22.4s, v22.4s, v2.4s\n"
+ "smax v28.4s, v28.4s, v2.4s\n"
+ "smax v18.4s, v18.4s, v2.4s\n"
+ "smax v27.4s, v27.4s, v2.4s\n"
+ "smax v21.4s, v21.4s, v2.4s\n"
+ "smax v26.4s, v26.4s, v2.4s\n"
+ "smax v17.4s, v17.4s, v2.4s\n"
+ "smax v25.4s, v25.4s, v2.4s\n"
+ "smax v20.4s, v20.4s, v2.4s\n"
+ "smax v24.4s, v24.4s, v2.4s\n"
+ "smax v16.4s, v16.4s, v2.4s\n"
+ "smin v31.4s, v31.4s, v0.4s\n"
+ "smin v23.4s, v23.4s, v0.4s\n"
+ "smin v30.4s, v30.4s, v0.4s\n"
+ "smin v19.4s, v19.4s, v0.4s\n"
+ "smin v29.4s, v29.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v0.4s\n"
+ "smin v27.4s, v27.4s, v0.4s\n"
+ "smin v21.4s, v21.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v0.4s\n"
+ "smin v17.4s, v17.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v0.4s\n"
+ "smin v20.4s, v20.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v0.4s\n"
+ "smin v16.4s, v16.4s, v0.4s\n"
+ "uzp1 v23.16b, v31.16b, v23.16b\n"
+ "uzp1 v19.16b, v30.16b, v19.16b\n"
"uzp1 v22.16b, v29.16b, v22.16b\n"
- "uzp1 v21.16b, v28.16b, v21.16b\n"
- "uzp1 v20.16b, v27.16b, v20.16b\n"
+ "uzp1 v18.16b, v28.16b, v18.16b\n"
+ "uzp1 v21.16b, v27.16b, v21.16b\n"
"uzp1 v17.16b, v26.16b, v17.16b\n"
- "uzp1 v19.16b, v25.16b, v19.16b\n"
- "uzp1 v18.16b, v24.16b, v18.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x27]\n"
+ "uzp1 v20.16b, v25.16b, v20.16b\n"
+ "uzp1 v16.16b, v24.16b, v16.16b\n"
+ "uzp1 v19.16b, v23.16b, v19.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v16.16b\n"
+ "str q19, [%x[outptr], x9]\n"
+ "add x9, x9, #0x40\n"
+ "str q18, [%x[outptr], x28]\n"
+ "add x28, x28, #0x40\n"
+ "str q17, [%x[outptr], x27]\n"
"add x27, x27, #0x40\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
"str q16, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
- "str q17, [%x[outptr], x24]\n"
- "add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -310,314 +310,314 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "movi v8.16b, #0x0\n"
- "mov x22, %x[inptrs]\n"
+ "movi v6.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q3, [x20, x27]\n"
- "ldp x21, x20, [x22, #0x10]\n"
- "add x22, x22, #0x20\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v17.16b, v4.16b, v3.16b\n"
- "umax v16.16b, v28.16b, v22.16b\n"
- "ldp x21, x20, [x22, #0x0]\n"
- "ldr q4, [x21, x27]\n"
- "ldr q3, [x20, x27]\n"
- "umax v16.16b, v17.16b, v16.16b\n"
- "ldp x21, x20, [x22, #0x10]\n"
+ "umax v17.16b, v5.16b, v4.16b\n"
+ "umax v16.16b, v3.16b, v2.16b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
- "ldr q28, [x21, x27]\n"
- "ldr q22, [x20, x27]\n"
- "umax v8.16b, v8.16b, v16.16b\n"
- "add x22, x22, #0x20\n"
+ "add x24, x24, #0x20\n"
+ "ldr q5, [x23, x9]\n"
+ "ldr q4, [x22, x9]\n"
+ "ldr q3, [x21, x9]\n"
+ "ldr q2, [x20, x9]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v17.16b, v4.16b, v3.16b\n"
- "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v5.16b, v4.16b\n"
+ "umax v16.16b, v3.16b, v2.16b\n"
"umax v16.16b, v17.16b, v16.16b\n"
- "umax v8.16b, v8.16b, v16.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x20, [x22], #0x8\n"
- "ldr q16, [x20, x27]\n"
+ "ldr x20, [x24], #0x8\n"
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q16, [x20, x9]\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v18.4s }, [x20]\n"
- "uxtl v17.8h, v8.8b\n"
- "uxtl2 v16.8h, v8.16b\n"
- "neg v18.4s, v18.4s\n"
+ "add x21, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "uxtl v17.8h, v6.8b\n"
+ "uxtl2 v26.8h, v6.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v23.4s }, [x20]\n"
- "saddw v22.4s, v18.4s, v17.4h\n"
- "saddw2 v21.4s, v18.4s, v17.8h\n"
- "saddw v20.4s, v18.4s, v16.4h\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v19.4s }, [x20]\n"
- "saddw2 v18.4s, v18.4s, v16.8h\n"
- "srshl v22.4s, v22.4s, v23.4s\n"
+ "ld1r { v16.4s }, [x21]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v17.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v23.4s\n"
- "srshl v20.4s, v20.4s, v23.4s\n"
+ "ld1r { v24.4s }, [x21]\n"
+ "ld1r { v23.4s }, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
- "srshl v18.4s, v18.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "movi v22.4s, #0x0\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "movi v20.4s, #0xff\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
+ "neg v16.4s, v16.4s\n"
"cmp %x[n_channels], #0x10\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqrdmulh v20.4s, v20.4s, v19.4s\n"
- "sqrdmulh v18.4s, v18.4s, v19.4s\n"
- "srshl v22.4s, v22.4s, v17.4s\n"
- "srshl v21.4s, v21.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v17.4s\n"
- "srshl v18.4s, v18.4s, v17.4s\n"
- "add v22.4s, v22.4s, v16.4s\n"
- "add v21.4s, v21.4s, v16.4s\n"
- "add v20.4s, v20.4s, v16.4s\n"
- "add v18.4s, v18.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v22.4s, v22.4s, v16.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
- "smin v22.4s, v22.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v16.4s\n"
- "smin v18.4s, v18.4s, v16.4s\n"
- "uzp1 v17.16b, v22.16b, v21.16b\n"
- "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "saddw v19.4s, v16.4s, v17.4h\n"
+ "saddw2 v17.4s, v16.4s, v17.8h\n"
+ "saddw v18.4s, v16.4s, v26.4h\n"
+ "saddw2 v16.4s, v16.4s, v26.8h\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "srshl v17.4s, v17.4s, v25.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v25.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v23.4s\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "add v17.4s, v17.4s, v21.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v16.4s, v16.4s, v21.4s\n"
+ "smax v19.4s, v19.4s, v22.4s\n"
+ "smax v17.4s, v17.4s, v22.4s\n"
+ "smax v18.4s, v18.4s, v22.4s\n"
+ "smax v16.4s, v16.4s, v22.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "uzp1 v17.16b, v19.16b, v17.16b\n"
+ "uzp1 v16.16b, v18.16b, v16.16b\n"
"uzp1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [%x[outptr], x27]\n"
- "add x27, x27, #0x10\n"
+ "str q16, [%x[outptr], x9]\n"
+ "add x9, x9, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"lsr x25, %x[n_valid_cells], #0x2\n"
- "add %x[outptr], %x[outptr], x27\n"
- "movi v8.16b, #0x0\n"
+ "add %x[outptr], %x[outptr], x9\n"
+ "movi v6.16b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "add x23, x23, x27\n"
- "add x22, x22, x27\n"
- "add x21, x21, x27\n"
+ "movi v5.16b, #0x0\n"
"movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "add x20, x20, x27\n"
- "movi v28.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
+ "movi v2.16b, #0x0\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d4, [x23], #0x8\n"
- "ldr d3, [x22], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d2, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
- "ld1 { v3.s }[2], [x22], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x22], #0x4\n"
+ "ld1 { v3.s }[2], [x21], #0x4\n"
+ "ld1 { v2.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
- "ld1 { v3.h }[6], [x22], #0x2\n"
- "ld1 { v28.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x22], #0x2\n"
+ "ld1 { v3.h }[6], [x21], #0x2\n"
+ "ld1 { v2.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
- "ld1 { v3.b }[14], [x22], #0x1\n"
- "ld1 { v28.b }[14], [x21], #0x1\n"
- "ld1 { v22.b }[14], [x20], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x22], #0x1\n"
+ "ld1 { v3.b }[14], [x21], #0x1\n"
+ "ld1 { v2.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
- "ld1 { v3.b }[12], [x22], #0x1\n"
- "ld1 { v28.b }[12], [x21], #0x1\n"
- "ld1 { v22.b }[12], [x20], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x22], #0x1\n"
+ "ld1 { v3.b }[12], [x21], #0x1\n"
+ "ld1 { v2.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
- "ld1 { v3.h }[4], [x22], #0x2\n"
- "ld1 { v28.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x22], #0x2\n"
+ "ld1 { v3.h }[4], [x21], #0x2\n"
+ "ld1 { v2.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
- "ld1 { v3.b }[10], [x22], #0x1\n"
- "ld1 { v28.b }[10], [x21], #0x1\n"
- "ld1 { v22.b }[10], [x20], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x22], #0x1\n"
+ "ld1 { v3.b }[10], [x21], #0x1\n"
+ "ld1 { v2.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
- "ld1 { v3.b }[8], [x22], #0x1\n"
- "ld1 { v28.b }[8], [x21], #0x1\n"
- "ld1 { v22.b }[8], [x20], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x22], #0x1\n"
+ "ld1 { v3.b }[8], [x21], #0x1\n"
+ "ld1 { v2.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s4, [x23], #0x4\n"
- "ldr s3, [x22], #0x4\n"
- "ldr s28, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s2, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
- "ld1 { v3.h }[2], [x22], #0x2\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
- "ld1 { v3.b }[6], [x22], #0x1\n"
- "ld1 { v28.b }[6], [x21], #0x1\n"
- "ld1 { v22.b }[6], [x20], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x22], #0x1\n"
+ "ld1 { v3.b }[6], [x21], #0x1\n"
+ "ld1 { v2.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
- "ld1 { v3.b }[4], [x22], #0x1\n"
- "ld1 { v28.b }[4], [x21], #0x1\n"
- "ld1 { v22.b }[4], [x20], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x22], #0x1\n"
+ "ld1 { v3.b }[4], [x21], #0x1\n"
+ "ld1 { v2.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h4, [x23], #0x2\n"
- "ldr h3, [x22], #0x2\n"
- "ldr h28, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h2, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
- "ld1 { v3.b }[2], [x22], #0x1\n"
- "ld1 { v28.b }[2], [x21], #0x1\n"
- "ld1 { v22.b }[2], [x20], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x22], #0x1\n"
+ "ld1 { v3.b }[2], [x21], #0x1\n"
+ "ld1 { v2.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b4, [x23], #0x1\n"
- "ldr b3, [x22], #0x1\n"
- "ldr b28, [x21], #0x1\n"
- "ldr b22, [x20], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
+ "ldr b4, [x22], #0x1\n"
+ "ldr b3, [x21], #0x1\n"
+ "ldr b2, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v17.16b, v4.16b, v3.16b\n"
- "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v17.16b, v5.16b, v4.16b\n"
+ "umax v16.16b, v3.16b, v2.16b\n"
"subs x25, x25, #0x1\n"
"umax v16.16b, v17.16b, v16.16b\n"
- "umax v8.16b, v8.16b, v16.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
"ldr x23, [x24], #0x8\n"
- "add x23, x23, x27\n"
- "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "add x23, x23, x9\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d4, [x23], #0x8\n"
+ "ldr d5, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v5.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v5.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v5.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v5.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v5.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v5.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v5.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v5.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v5.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h4, [x23], #0x2\n"
+ "ldr h5, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v5.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b4, [x23], #0x1\n"
+ "ldr b5, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
"subs x21, x21, #0x1\n"
- "umax v8.16b, v8.16b, v4.16b\n"
+ "umax v6.16b, v6.16b, v5.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v18.4s }, [x20]\n"
- "uxtl v17.8h, v8.8b\n"
- "uxtl2 v16.8h, v8.16b\n"
- "neg v18.4s, v18.4s\n"
+ "add x21, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "uxtl v17.8h, v6.8b\n"
+ "uxtl2 v26.8h, v6.16b\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1r { v23.4s }, [x20]\n"
- "saddw v22.4s, v18.4s, v17.4h\n"
- "saddw2 v21.4s, v18.4s, v17.8h\n"
- "saddw v20.4s, v18.4s, v16.4h\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v19.4s }, [x20]\n"
- "saddw2 v18.4s, v18.4s, v16.8h\n"
- "srshl v22.4s, v22.4s, v23.4s\n"
+ "ld1r { v16.4s }, [x21]\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1r { v17.4s }, [x20]\n"
- "srshl v21.4s, v21.4s, v23.4s\n"
- "srshl v20.4s, v20.4s, v23.4s\n"
+ "ld1r { v24.4s }, [x21]\n"
+ "ld1r { v23.4s }, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1r { v16.4s }, [x20]\n"
+ "movi v22.4s, #0x0\n"
+ "ld1r { v21.4s }, [x20]\n"
+ "movi v20.4s, #0xff\n"
+ "neg v16.4s, v16.4s\n"
+ "saddw v19.4s, v16.4s, v17.4h\n"
+ "saddw2 v17.4s, v16.4s, v17.8h\n"
+ "saddw v18.4s, v16.4s, v26.4h\n"
+ "saddw2 v16.4s, v16.4s, v26.8h\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "srshl v17.4s, v17.4s, v25.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v25.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v24.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v24.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v24.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "srshl v17.4s, v17.4s, v23.4s\n"
"srshl v18.4s, v18.4s, v23.4s\n"
- "sqrdmulh v22.4s, v22.4s, v19.4s\n"
- "sqrdmulh v21.4s, v21.4s, v19.4s\n"
- "sqrdmulh v20.4s, v20.4s, v19.4s\n"
- "sqrdmulh v18.4s, v18.4s, v19.4s\n"
- "srshl v22.4s, v22.4s, v17.4s\n"
- "srshl v21.4s, v21.4s, v17.4s\n"
- "srshl v20.4s, v20.4s, v17.4s\n"
- "srshl v18.4s, v18.4s, v17.4s\n"
- "add v22.4s, v22.4s, v16.4s\n"
- "add v21.4s, v21.4s, v16.4s\n"
- "add v20.4s, v20.4s, v16.4s\n"
- "add v18.4s, v18.4s, v16.4s\n"
- "movi v16.4s, #0x0\n"
- "smax v22.4s, v22.4s, v16.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "movi v16.4s, #0xff\n"
- "smin v22.4s, v22.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v16.4s\n"
- "smin v18.4s, v18.4s, v16.4s\n"
- "uzp1 v17.16b, v22.16b, v21.16b\n"
- "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "add v19.4s, v19.4s, v21.4s\n"
+ "add v17.4s, v17.4s, v21.4s\n"
+ "add v18.4s, v18.4s, v21.4s\n"
+ "add v16.4s, v16.4s, v21.4s\n"
+ "smax v19.4s, v19.4s, v22.4s\n"
+ "smax v17.4s, v17.4s, v22.4s\n"
+ "smax v18.4s, v18.4s, v22.4s\n"
+ "smax v16.4s, v16.4s, v22.4s\n"
+ "smin v19.4s, v19.4s, v20.4s\n"
+ "smin v17.4s, v17.4s, v20.4s\n"
+ "smin v18.4s, v18.4s, v20.4s\n"
+ "smin v16.4s, v16.4s, v20.4s\n"
+ "uzp1 v17.16b, v19.16b, v17.16b\n"
+ "uzp1 v16.16b, v18.16b, v16.16b\n"
"uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
@@ -667,7 +667,7 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"43:" // End
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 67b07205cd..672a9aefe0 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,111 +89,111 @@ void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
"whilelt p0.h, XZR, x20\n"
"add x20, %x[args], %[offsetof_rescale]\n"
- "ld1rqh { z4.h }, p0/Z, [x20]\n"
"ldr x5, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p0.h, x3, x5\n"
"mov x6, #0x0\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
"ldp x7, x8, [x21, #0x0]\n"
"ldp x17, x16, [x21, #0x10]\n"
"ldp x15, x14, [x4, #0x0]\n"
- "ld1h { z3.h }, p0/Z, [x14, x3, LSL #1]\n"
+ "whilelt p0.h, x3, x5\n"
"ldp x13, x12, [x4, #0x10]\n"
- "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
"ldp x11, x10, [x4, #0x20]\n"
- "ld1h { z1.h }, p0/Z, [x10, x3, LSL #1]\n"
"ldp x9, x28, [x4, #0x30]\n"
- "ld1h { z0.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x14, x3, LSL #1]\n"
"ldp x27, x26, [x4, #0x40]\n"
- "ld1h { z31.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x13, x3, LSL #1]\n"
"ldp x25, x24, [x4, #0x50]\n"
- "ld1h { z30.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x10, x3, LSL #1]\n"
"ldp x23, x22, [x4, #0x60]\n"
- "ld1h { z29.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x9, x3, LSL #1]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1h { z28.h }, p0/Z, [x27, x3, LSL #1]\n"
- "ld1h { z27.h }, p0/Z, [x28, x3, LSL #1]\n"
- "ld1h { z22.h }, p0/Z, [x24, x3, LSL #1]\n"
- "ld1h { z21.h }, p0/Z, [x22, x3, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x3, LSL #1]\n"
- "ld1h { z26.h }, p0/Z, [x15, x3, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "ld1h { z0.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z22.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z21.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
"whilelt p1.h, x3, x5\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.h, z1.h, z0.h\n"
- "fadd z16.h, z31.h, z30.h\n"
- "ld1h { z1.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "fadd z19.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "ld1h { z2.h }, p1/Z, [x10, x3, LSL #1]\n"
"whilelt p0.h, x6, x5\n"
- "fadd z19.h, z17.h, z16.h\n"
- "fadd z18.h, z3.h, z2.h\n"
- "ld1h { z0.h }, p1/Z, [x9, x3, LSL #1]\n"
- "fadd z17.h, z29.h, z28.h\n"
- "fadd z22.h, z27.h, z22.h\n"
- "ld1h { z31.h }, p1/Z, [x26, x3, LSL #1]\n"
- "fadd z16.h, z21.h, z20.h\n"
- "fadd z21.h, z18.h, z19.h\n"
- "ld1h { z30.h }, p1/Z, [x25, x3, LSL #1]\n"
- "fadd z20.h, z16.h, z19.h\n"
- "fadd z19.h, z26.h, z17.h\n"
- "ld1h { z3.h }, p1/Z, [x14, x3, LSL #1]\n"
- "fadd z18.h, z25.h, z22.h\n"
- "fadd z17.h, z24.h, z17.h\n"
- "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
- "fadd z16.h, z23.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
- "ld1h { z29.h }, p1/Z, [x11, x3, LSL #1]\n"
- "fadd z18.h, z21.h, z18.h\n"
- "fadd z17.h, z17.h, z20.h\n"
- "ld1h { z28.h }, p1/Z, [x27, x3, LSL #1]\n"
- "fadd z16.h, z16.h, z20.h\n"
- "ld1h { z27.h }, p1/Z, [x28, x3, LSL #1]\n"
- "fmul z19.h, z19.h, z4.h[0]\n"
- "ld1h { z22.h }, p1/Z, [x24, x3, LSL #1]\n"
- "fmul z18.h, z18.h, z4.h[1]\n"
- "fmul z17.h, z17.h, z4.h[2]\n"
- "ld1h { z21.h }, p1/Z, [x22, x3, LSL #1]\n"
- "fmul z16.h, z16.h, z4.h[3]\n"
+ "fadd z23.h, z4.h, z3.h\n"
+ "fadd z18.h, z30.h, z29.h\n"
+ "ld1h { z1.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z17.h, z28.h, z27.h\n"
+ "fadd z22.h, z26.h, z22.h\n"
+ "ld1h { z0.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z31.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z16.h, z19.h, z16.h\n"
+ "ld1h { z4.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "fadd z19.h, z25.h, z18.h\n"
+ "fadd z21.h, z21.h, z18.h\n"
+ "ld1h { z3.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z18.h, z24.h, z17.h\n"
+ "fadd z20.h, z20.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z17.h, z23.h, z16.h\n"
+ "fadd z16.h, z22.h, z16.h\n"
+ "ld1h { z28.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z22.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "fadd z19.h, z17.h, z19.h\n"
+ "fadd z18.h, z17.h, z18.h\n"
+ "ld1h { z25.h }, p1/Z, [x15, x3, LSL #1]\n"
+ "fadd z17.h, z21.h, z16.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "ld1h { z24.h }, p1/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "fmul z19.h, z19.h, z5.h[0]\n"
+ "fmul z18.h, z18.h, z5.h[1]\n"
+ "fmul z17.h, z17.h, z5.h[2]\n"
+ "fmul z16.h, z16.h, z5.h[3]\n"
"st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x21, x3, LSL #1]\n"
"st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x15, x3, LSL #1]\n"
"st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
"st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
"incw x6\n"
- "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
- "incw x3\n"
- "whilelt p1.h, x3, x5\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.h, z1.h, z0.h\n"
- "fadd z16.h, z31.h, z30.h\n"
+ "fadd z19.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
"whilelt p0.h, x6, x5\n"
- "fadd z19.h, z17.h, z16.h\n"
- "fadd z18.h, z3.h, z2.h\n"
- "fadd z17.h, z29.h, z28.h\n"
- "fadd z22.h, z27.h, z22.h\n"
- "fadd z16.h, z21.h, z20.h\n"
- "fadd z21.h, z18.h, z19.h\n"
- "fadd z20.h, z16.h, z19.h\n"
- "fadd z19.h, z26.h, z17.h\n"
- "fadd z18.h, z25.h, z22.h\n"
- "fadd z17.h, z24.h, z17.h\n"
- "fadd z16.h, z23.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
- "fadd z18.h, z21.h, z18.h\n"
- "fadd z17.h, z17.h, z20.h\n"
- "fadd z16.h, z16.h, z20.h\n"
- "fmul z19.h, z19.h, z4.h[0]\n"
+ "fadd z23.h, z4.h, z3.h\n"
+ "fadd z18.h, z30.h, z29.h\n"
+ "fadd z17.h, z28.h, z27.h\n"
+ "fadd z22.h, z26.h, z22.h\n"
+ "fadd z16.h, z19.h, z16.h\n"
+ "fadd z19.h, z25.h, z18.h\n"
+ "fadd z21.h, z21.h, z18.h\n"
+ "fadd z18.h, z24.h, z17.h\n"
+ "fadd z20.h, z20.h, z17.h\n"
+ "fadd z17.h, z23.h, z16.h\n"
+ "fadd z16.h, z22.h, z16.h\n"
+ "fadd z19.h, z17.h, z19.h\n"
+ "fadd z18.h, z17.h, z18.h\n"
+ "fadd z17.h, z21.h, z16.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "fmul z19.h, z19.h, z5.h[0]\n"
+ "fmul z18.h, z18.h, z5.h[1]\n"
+ "fmul z17.h, z17.h, z5.h[2]\n"
+ "fmul z16.h, z16.h, z5.h[3]\n"
"st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
- "fmul z18.h, z18.h, z4.h[1]\n"
- "fmul z17.h, z17.h, z4.h[2]\n"
"st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
- "fmul z16.h, z16.h, z4.h[3]\n"
"st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
"st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 60f17b7bc2..dee5b4a230 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,25 +49,25 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"cnth x26, ALL, MUL #3\n"
"ptrue p0.b\n"
"whilelt p3.h, x9, %x[n_channels]\n"
- "ld1rh { z6.h }, p0/Z, [%x[rescale_ptr]]\n"
"whilelt p2.h, x28, %x[n_channels]\n"
+ "ld1rh { z5.h }, p0/Z, [%x[rescale_ptr]]\n"
"whilelt p1.h, x27, %x[n_channels]\n"
"whilelt p0.h, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov x24, %x[inptrs]\n"
"mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z23.h }, p3/Z, [x22, x9, LSL #1]\n"
"ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
"ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
@@ -84,7 +84,7 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.h, z1.h, z0.h\n"
+ "fadd z23.h, z0.h, z23.h\n"
"fadd z19.h, z31.h, z30.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
@@ -94,24 +94,24 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
+ "ld1h { z23.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z18.h, z22.h, z18.h\n"
"ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
- "fadd z16.h, z20.h, z16.h\n"
"ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
- "fadd z5.h, z5.h, z19.h\n"
- "fadd z4.h, z4.h, z18.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
"ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
- "fadd z3.h, z3.h, z17.h\n"
- "fadd z2.h, z2.h, z16.h\n"
+ "fadd z4.h, z4.h, z19.h\n"
"ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "fadd z3.h, z3.h, z18.h\n"
"ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "fadd z2.h, z2.h, z17.h\n"
"ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z1.h, z1.h, z16.h\n"
"ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
"ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
"ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
@@ -122,7 +122,7 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.h, z1.h, z0.h\n"
+ "fadd z23.h, z0.h, z23.h\n"
"fadd z19.h, z31.h, z30.h\n"
"fadd z22.h, z29.h, z22.h\n"
"fadd z18.h, z28.h, z18.h\n"
@@ -134,37 +134,37 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"fadd z18.h, z22.h, z18.h\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "fadd z5.h, z5.h, z19.h\n"
- "fadd z4.h, z4.h, z18.h\n"
- "fadd z3.h, z3.h, z17.h\n"
- "fadd z2.h, z2.h, z16.h\n"
+ "fadd z4.h, z4.h, z19.h\n"
+ "fadd z3.h, z3.h, z18.h\n"
+ "fadd z2.h, z2.h, z17.h\n"
+ "fadd z1.h, z1.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.h, z5.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x28, LSL #1]\n"
- "fadd z4.h, z4.h, z16.h\n"
- "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
- "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z19.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
"ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
- "fadd z2.h, z2.h, z16.h\n"
+ "fadd z4.h, z4.h, z19.h\n"
+ "fadd z3.h, z3.h, z18.h\n"
+ "fadd z2.h, z2.h, z17.h\n"
+ "fadd z1.h, z1.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z5.h, z5.h, z6.h\n"
- "fmul z4.h, z4.h, z6.h\n"
- "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "fmul z4.h, z4.h, z5.h\n"
+ "fmul z3.h, z3.h, z5.h\n"
+ "fmul z2.h, z2.h, z5.h\n"
+ "fmul z1.h, z1.h, z5.h\n"
+ "st1h { z4.h }, p3, [%x[outptr], x9, LSL #1]\n"
"inch x9, ALL, MUL #4\n"
- "fmul z3.h, z3.h, z6.h\n"
- "fmul z2.h, z2.h, z6.h\n"
- "st1h { z4.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "st1h { z3.h }, p2, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
- "st1h { z3.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "st1h { z2.h }, p1, [%x[outptr], x27, LSL #1]\n"
"inch x27, ALL, MUL #4\n"
- "st1h { z2.h }, p0, [%x[outptr], x26, LSL #1]\n"
+ "st1h { z1.h }, p0, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
"whilelt p0.h, x26, %x[n_channels]\n"
"b.any 1b\n"
@@ -173,49 +173,49 @@ void sme_fp16_nhwc_avg_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z5.b, #0x0\n"
+ "mov z4.b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z1.h }, p3/Z, [x20, x9, LSL #1]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z23.h }, p3/Z, [x22, x9, LSL #1]\n"
"ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
"ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z17.h, z1.h, z0.h\n"
+ "fadd z17.h, z0.h, z23.h\n"
"fadd z16.h, z31.h, z30.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fadd z16.h, z17.h, z16.h\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fadd z5.h, z5.h, z16.h\n"
"add x24, x24, #0x20\n"
- "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "ld1h { z23.h }, p3/Z, [x22, x9, LSL #1]\n"
"ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
"ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z17.h, z1.h, z0.h\n"
+ "fadd z17.h, z0.h, z23.h\n"
"fadd z16.h, z31.h, z30.h\n"
"fadd z16.h, z17.h, z16.h\n"
- "fadd z5.h, z5.h, z16.h\n"
+ "fadd z4.h, z4.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.h, z5.h, z16.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z5.h, z5.h, z6.h\n"
- "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "fmul z4.h, z4.h, z5.h\n"
+ "st1h { z4.h }, p3, [%x[outptr], x9, LSL #1]\n"
"inch x9\n"
"whilelt p3.h, x9, %x[n_channels]\n"
"b.any 8b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 7fc776ed4e..7c2ca4c452 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,26 +66,26 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"mov x15, #0x0\n"
- "ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ptrue p2.b\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p0.h, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
+ "whilelt p0.h, x15, x13\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
"ldr x20, [x20, #0x40]\n"
+ "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
"ld1h { z26.h }, p0/Z, [x28, x15, LSL #1]\n"
"ld1h { z25.h }, p0/Z, [x26, x15, LSL #1]\n"
"ld1h { z24.h }, p0/Z, [x23, x15, LSL #1]\n"
- "ld1h { z19.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x15, LSL #1]\n"
"ld1h { z23.h }, p0/Z, [x20, x15, LSL #1]\n"
"incw x15\n"
"whilelt p1.h, x15, x13\n"
@@ -98,24 +98,24 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
"movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
"ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
- "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z20.h\n"
"movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
"ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
"ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
"movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
"movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
- "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
"movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
"movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
- "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
- "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
- "ld1h { z19.h }, p1/Z, [x22, x15, LSL #1]\n"
- "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x22, x15, LSL #1]\n"
"ld1h { z23.h }, p1/Z, [x20, x15, LSL #1]\n"
"incw x15\n"
"whilelt p1.h, x15, x13\n"
+ "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
"incw x14\n"
"b.any 1b\n"
@@ -123,15 +123,15 @@ void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
"movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
"whilelt p0.h, x14, x13\n"
- "movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
- "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
- "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z19.h\n"
- "movprfx z19, z24\n fmax z19.h, p2/M, z19.h, z23.h\n"
- "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "fmax z18.h, p2/M, z18.h, z22.h\n"
- "st1h { z16.h }, p0, [x12, x14, LSL #1]\n"
- "fmax z17.h, p2/M, z17.h, z21.h\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
+ "movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
"st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
"st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
"st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
index afa2ccbd71..bfdf1b8b5a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,21 +53,21 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.h, #0xfc00\n"
"mov z4.h, #0xfc00\n"
- "mov z3.h, #0xfc00\n"
"mov x24, %x[inptrs]\n"
+ "mov z3.h, #0xfc00\n"
"mov z2.h, #0xfc00\n"
- "mov z1.h, #0xfc00\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x22, x9, LSL #1]\n"
"ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
- "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x23, x28, LSL #1]\n"
"ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
"ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
"ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
@@ -81,34 +81,34 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
- "fmax z23.h, p0/M, z23.h, z30.h\n"
+ "movprfx z19, z1\n fmax z19.h, p0/M, z19.h, z0.h\n"
+ "fmax z23.h, p0/M, z23.h, z31.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z18.h, p0/M, z18.h, z29.h\n"
+ "movprfx z18, z30\n fmax z18.h, p0/M, z18.h, z29.h\n"
"fmax z22.h, p0/M, z22.h, z28.h\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
"fmax z17.h, p0/M, z17.h, z27.h\n"
"fmax z21.h, p0/M, z21.h, z26.h\n"
- "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
"fmax z16.h, p0/M, z16.h, z25.h\n"
"fmax z20.h, p0/M, z20.h, z24.h\n"
- "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x23, x9, LSL #1]\n"
"fmax z19.h, p0/M, z19.h, z23.h\n"
+ "ld1h { z0.h }, p4/Z, [x22, x9, LSL #1]\n"
"fmax z18.h, p0/M, z18.h, z22.h\n"
"ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
"fmax z17.h, p0/M, z17.h, z21.h\n"
+ "ld1h { z31.h }, p4/Z, [x20, x9, LSL #1]\n"
"fmax z16.h, p0/M, z16.h, z20.h\n"
- "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
- "fmax z4.h, p0/M, z4.h, z19.h\n"
- "fmax z3.h, p0/M, z3.h, z18.h\n"
- "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z2.h, p0/M, z2.h, z17.h\n"
- "fmax z1.h, p0/M, z1.h, z16.h\n"
+ "ld1h { z30.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z19.h\n"
"ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "fmax z4.h, p0/M, z4.h, z18.h\n"
"ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "fmax z3.h, p0/M, z3.h, z17.h\n"
"ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z2.h, p0/M, z2.h, z16.h\n"
"ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
"ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
"ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
@@ -119,9 +119,9 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
- "fmax z23.h, p0/M, z23.h, z30.h\n"
- "fmax z18.h, p0/M, z18.h, z29.h\n"
+ "movprfx z19, z1\n fmax z19.h, p0/M, z19.h, z0.h\n"
+ "fmax z23.h, p0/M, z23.h, z31.h\n"
+ "movprfx z18, z30\n fmax z18.h, p0/M, z18.h, z29.h\n"
"fmax z22.h, p0/M, z22.h, z28.h\n"
"fmax z17.h, p0/M, z17.h, z27.h\n"
"fmax z21.h, p0/M, z21.h, z26.h\n"
@@ -131,33 +131,33 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"fmax z18.h, p0/M, z18.h, z22.h\n"
"fmax z17.h, p0/M, z17.h, z21.h\n"
"fmax z16.h, p0/M, z16.h, z20.h\n"
- "fmax z4.h, p0/M, z4.h, z19.h\n"
- "fmax z3.h, p0/M, z3.h, z18.h\n"
- "fmax z2.h, p0/M, z2.h, z17.h\n"
- "fmax z1.h, p0/M, z1.h, z16.h\n"
+ "fmax z5.h, p0/M, z5.h, z19.h\n"
+ "fmax z4.h, p0/M, z4.h, z18.h\n"
+ "fmax z3.h, p0/M, z3.h, z17.h\n"
+ "fmax z2.h, p0/M, z2.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.h, p0/M, z4.h, z16.h\n"
- "ld1h { z16.h }, p3/Z, [x20, x28, LSL #1]\n"
- "fmax z3.h, p0/M, z3.h, z16.h\n"
- "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
- "fmax z2.h, p0/M, z2.h, z16.h\n"
+ "ld1h { z19.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p2/Z, [x20, x27, LSL #1]\n"
"ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fmax z1.h, p0/M, z1.h, z16.h\n"
+ "fmax z5.h, p0/M, z5.h, z19.h\n"
+ "fmax z4.h, p0/M, z4.h, z18.h\n"
+ "fmax z3.h, p0/M, z3.h, z17.h\n"
+ "fmax z2.h, p0/M, z2.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "st1h { z5.h }, p4, [%x[outptr], x9, LSL #1]\n"
"inch x9, ALL, MUL #4\n"
- "st1h { z3.h }, p3, [%x[outptr], x28, LSL #1]\n"
+ "st1h { z4.h }, p3, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
- "st1h { z2.h }, p2, [%x[outptr], x27, LSL #1]\n"
+ "st1h { z3.h }, p2, [%x[outptr], x27, LSL #1]\n"
"inch x27, ALL, MUL #4\n"
- "st1h { z1.h }, p1, [%x[outptr], x26, LSL #1]\n"
+ "st1h { z2.h }, p1, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
"whilelt p1.h, x26, %x[n_channels]\n"
"b.any 1b\n"
@@ -166,48 +166,48 @@ void sme_fp16_nhwc_max_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z4.h, #0xfc00\n"
+ "mov z5.h, #0xfc00\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1h { z0.h }, p4/Z, [x20, x9, LSL #1]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p4/Z, [x22, x9, LSL #1]\n"
"ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
- "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "movprfx z16, z1\n fmax z16.h, p0/M, z16.h, z0.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z31.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z16.h, p0/M, z16.h, z17.h\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fmax z4.h, p0/M, z4.h, z16.h\n"
"add x24, x24, #0x20\n"
- "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "ld1h { z0.h }, p4/Z, [x22, x9, LSL #1]\n"
"ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z31.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
- "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+ "movprfx z16, z1\n fmax z16.h, p0/M, z16.h, z0.h\n"
+ "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z31.h\n"
"fmax z16.h, p0/M, z16.h, z17.h\n"
- "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.h, p0/M, z4.h, z16.h\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "st1h { z5.h }, p4, [%x[outptr], x9, LSL #1]\n"
"inch x9\n"
"whilelt p4.h, x9, %x[n_channels]\n"
"b.any 8b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 8c8532827a..51096c8f29 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,111 +89,111 @@ void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
"whilelt p0.s, XZR, x20\n"
"add x20, %x[args], %[offsetof_rescale]\n"
- "ld1rqw { z4.s }, p0/Z, [x20]\n"
"ldr x5, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p0.s, x3, x5\n"
"mov x6, #0x0\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
"ldp x7, x8, [x21, #0x0]\n"
"ldp x17, x16, [x21, #0x10]\n"
"ldp x15, x14, [x4, #0x0]\n"
- "ld1w { z3.s }, p0/Z, [x14, x3, LSL #2]\n"
+ "whilelt p0.s, x3, x5\n"
"ldp x13, x12, [x4, #0x10]\n"
- "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
"ldp x11, x10, [x4, #0x20]\n"
- "ld1w { z1.s }, p0/Z, [x10, x3, LSL #2]\n"
"ldp x9, x28, [x4, #0x30]\n"
- "ld1w { z0.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x14, x3, LSL #2]\n"
"ldp x27, x26, [x4, #0x40]\n"
- "ld1w { z31.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x13, x3, LSL #2]\n"
"ldp x25, x24, [x4, #0x50]\n"
- "ld1w { z30.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x10, x3, LSL #2]\n"
"ldp x23, x22, [x4, #0x60]\n"
- "ld1w { z29.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x9, x3, LSL #2]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1w { z28.s }, p0/Z, [x27, x3, LSL #2]\n"
- "ld1w { z27.s }, p0/Z, [x28, x3, LSL #2]\n"
- "ld1w { z22.s }, p0/Z, [x24, x3, LSL #2]\n"
- "ld1w { z21.s }, p0/Z, [x22, x3, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x3, LSL #2]\n"
- "ld1w { z26.s }, p0/Z, [x15, x3, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+ "ld1w { z0.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x15, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
"whilelt p1.s, x3, x5\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.s, z1.s, z0.s\n"
- "fadd z16.s, z31.s, z30.s\n"
- "ld1w { z1.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "fadd z19.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "ld1w { z2.s }, p1/Z, [x10, x3, LSL #2]\n"
"whilelt p0.s, x6, x5\n"
- "fadd z19.s, z17.s, z16.s\n"
- "fadd z18.s, z3.s, z2.s\n"
- "ld1w { z0.s }, p1/Z, [x9, x3, LSL #2]\n"
- "fadd z17.s, z29.s, z28.s\n"
- "fadd z22.s, z27.s, z22.s\n"
- "ld1w { z31.s }, p1/Z, [x26, x3, LSL #2]\n"
- "fadd z16.s, z21.s, z20.s\n"
- "fadd z21.s, z18.s, z19.s\n"
- "ld1w { z30.s }, p1/Z, [x25, x3, LSL #2]\n"
- "fadd z20.s, z16.s, z19.s\n"
- "fadd z19.s, z26.s, z17.s\n"
- "ld1w { z3.s }, p1/Z, [x14, x3, LSL #2]\n"
- "fadd z18.s, z25.s, z22.s\n"
- "fadd z17.s, z24.s, z17.s\n"
- "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
- "fadd z16.s, z23.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
- "ld1w { z29.s }, p1/Z, [x11, x3, LSL #2]\n"
- "fadd z18.s, z21.s, z18.s\n"
- "fadd z17.s, z17.s, z20.s\n"
- "ld1w { z28.s }, p1/Z, [x27, x3, LSL #2]\n"
- "fadd z16.s, z16.s, z20.s\n"
- "ld1w { z27.s }, p1/Z, [x28, x3, LSL #2]\n"
- "fmul z19.s, z19.s, z4.s[0]\n"
- "ld1w { z22.s }, p1/Z, [x24, x3, LSL #2]\n"
- "fmul z18.s, z18.s, z4.s[1]\n"
- "fmul z17.s, z17.s, z4.s[2]\n"
- "ld1w { z21.s }, p1/Z, [x22, x3, LSL #2]\n"
- "fmul z16.s, z16.s, z4.s[3]\n"
+ "fadd z23.s, z4.s, z3.s\n"
+ "fadd z18.s, z30.s, z29.s\n"
+ "ld1w { z1.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "fadd z17.s, z28.s, z27.s\n"
+ "fadd z22.s, z26.s, z22.s\n"
+ "ld1w { z0.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z31.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "fadd z16.s, z19.s, z16.s\n"
+ "ld1w { z4.s }, p1/Z, [x14, x3, LSL #2]\n"
+ "fadd z19.s, z25.s, z18.s\n"
+ "fadd z21.s, z21.s, z18.s\n"
+ "ld1w { z3.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "fadd z18.s, z24.s, z17.s\n"
+ "fadd z20.s, z20.s, z17.s\n"
+ "ld1w { z30.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x27, x3, LSL #2]\n"
+ "fadd z17.s, z23.s, z16.s\n"
+ "fadd z16.s, z22.s, z16.s\n"
+ "ld1w { z28.s }, p1/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z22.s }, p1/Z, [x21, x3, LSL #2]\n"
+ "fadd z19.s, z17.s, z19.s\n"
+ "fadd z18.s, z17.s, z18.s\n"
+ "ld1w { z25.s }, p1/Z, [x15, x3, LSL #2]\n"
+ "fadd z17.s, z21.s, z16.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
+ "ld1w { z24.s }, p1/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
+ "whilelt p1.s, x3, x5\n"
+ "fmul z19.s, z19.s, z5.s[0]\n"
+ "fmul z18.s, z18.s, z5.s[1]\n"
+ "fmul z17.s, z17.s, z5.s[2]\n"
+ "fmul z16.s, z16.s, z5.s[3]\n"
"st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x21, x3, LSL #2]\n"
"st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x15, x3, LSL #2]\n"
"st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
"st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
"incw x6\n"
- "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
- "incw x3\n"
- "whilelt p1.s, x3, x5\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.s, z1.s, z0.s\n"
- "fadd z16.s, z31.s, z30.s\n"
+ "fadd z19.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
"whilelt p0.s, x6, x5\n"
- "fadd z19.s, z17.s, z16.s\n"
- "fadd z18.s, z3.s, z2.s\n"
- "fadd z17.s, z29.s, z28.s\n"
- "fadd z22.s, z27.s, z22.s\n"
- "fadd z16.s, z21.s, z20.s\n"
- "fadd z21.s, z18.s, z19.s\n"
- "fadd z20.s, z16.s, z19.s\n"
- "fadd z19.s, z26.s, z17.s\n"
- "fadd z18.s, z25.s, z22.s\n"
- "fadd z17.s, z24.s, z17.s\n"
- "fadd z16.s, z23.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
- "fadd z18.s, z21.s, z18.s\n"
- "fadd z17.s, z17.s, z20.s\n"
- "fadd z16.s, z16.s, z20.s\n"
- "fmul z19.s, z19.s, z4.s[0]\n"
+ "fadd z23.s, z4.s, z3.s\n"
+ "fadd z18.s, z30.s, z29.s\n"
+ "fadd z17.s, z28.s, z27.s\n"
+ "fadd z22.s, z26.s, z22.s\n"
+ "fadd z16.s, z19.s, z16.s\n"
+ "fadd z19.s, z25.s, z18.s\n"
+ "fadd z21.s, z21.s, z18.s\n"
+ "fadd z18.s, z24.s, z17.s\n"
+ "fadd z20.s, z20.s, z17.s\n"
+ "fadd z17.s, z23.s, z16.s\n"
+ "fadd z16.s, z22.s, z16.s\n"
+ "fadd z19.s, z17.s, z19.s\n"
+ "fadd z18.s, z17.s, z18.s\n"
+ "fadd z17.s, z21.s, z16.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
+ "fmul z19.s, z19.s, z5.s[0]\n"
+ "fmul z18.s, z18.s, z5.s[1]\n"
+ "fmul z17.s, z17.s, z5.s[2]\n"
+ "fmul z16.s, z16.s, z5.s[3]\n"
"st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
- "fmul z18.s, z18.s, z4.s[1]\n"
- "fmul z17.s, z17.s, z4.s[2]\n"
"st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
- "fmul z16.s, z16.s, z4.s[3]\n"
"st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
"st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
".inst 0xd503467f // SMSTOP\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 86e7f84542..908c66b4d5 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,25 +49,25 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"cntw x26, ALL, MUL #3\n"
"ptrue p0.b\n"
"whilelt p3.s, x9, %x[n_channels]\n"
- "ld1rw { z6.s }, p0/Z, [%x[rescale_ptr]]\n"
"whilelt p2.s, x28, %x[n_channels]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[rescale_ptr]]\n"
"whilelt p1.s, x27, %x[n_channels]\n"
"whilelt p0.s, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov x24, %x[inptrs]\n"
"mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z23.s }, p3/Z, [x22, x9, LSL #2]\n"
"ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
"ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
@@ -84,7 +84,7 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.s, z1.s, z0.s\n"
+ "fadd z23.s, z0.s, z23.s\n"
"fadd z19.s, z31.s, z30.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
@@ -94,24 +94,24 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
+ "ld1w { z23.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z18.s, z22.s, z18.s\n"
"ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
- "fadd z16.s, z20.s, z16.s\n"
"ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
- "fadd z5.s, z5.s, z19.s\n"
- "fadd z4.s, z4.s, z18.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
"ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
- "fadd z3.s, z3.s, z17.s\n"
- "fadd z2.s, z2.s, z16.s\n"
+ "fadd z4.s, z4.s, z19.s\n"
"ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fadd z3.s, z3.s, z18.s\n"
"ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "fadd z2.s, z2.s, z17.s\n"
"ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z1.s, z1.s, z16.s\n"
"ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
"ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
"ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
@@ -122,7 +122,7 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.s, z1.s, z0.s\n"
+ "fadd z23.s, z0.s, z23.s\n"
"fadd z19.s, z31.s, z30.s\n"
"fadd z22.s, z29.s, z22.s\n"
"fadd z18.s, z28.s, z18.s\n"
@@ -134,37 +134,37 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"fadd z18.s, z22.s, z18.s\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "fadd z5.s, z5.s, z19.s\n"
- "fadd z4.s, z4.s, z18.s\n"
- "fadd z3.s, z3.s, z17.s\n"
- "fadd z2.s, z2.s, z16.s\n"
+ "fadd z4.s, z4.s, z19.s\n"
+ "fadd z3.s, z3.s, z18.s\n"
+ "fadd z2.s, z2.s, z17.s\n"
+ "fadd z1.s, z1.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.s, z5.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
- "fadd z4.s, z4.s, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
- "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
"ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
- "fadd z2.s, z2.s, z16.s\n"
+ "fadd z4.s, z4.s, z19.s\n"
+ "fadd z3.s, z3.s, z18.s\n"
+ "fadd z2.s, z2.s, z17.s\n"
+ "fadd z1.s, z1.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z5.s, z5.s, z6.s\n"
- "fmul z4.s, z4.s, z6.s\n"
- "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "fmul z4.s, z4.s, z5.s\n"
+ "fmul z3.s, z3.s, z5.s\n"
+ "fmul z2.s, z2.s, z5.s\n"
+ "fmul z1.s, z1.s, z5.s\n"
+ "st1w { z4.s }, p3, [%x[outptr], x9, LSL #2]\n"
"incw x9, ALL, MUL #4\n"
- "fmul z3.s, z3.s, z6.s\n"
- "fmul z2.s, z2.s, z6.s\n"
- "st1w { z4.s }, p2, [%x[outptr], x28, LSL #2]\n"
+ "st1w { z3.s }, p2, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
- "st1w { z3.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "st1w { z2.s }, p1, [%x[outptr], x27, LSL #2]\n"
"incw x27, ALL, MUL #4\n"
- "st1w { z2.s }, p0, [%x[outptr], x26, LSL #2]\n"
+ "st1w { z1.s }, p0, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
"whilelt p0.s, x26, %x[n_channels]\n"
"b.any 1b\n"
@@ -173,49 +173,49 @@ void sme_fp32_nhwc_avg_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z5.b, #0x0\n"
+ "mov z4.b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z1.s }, p3/Z, [x20, x9, LSL #2]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z23.s }, p3/Z, [x22, x9, LSL #2]\n"
"ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
"ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z17.s, z1.s, z0.s\n"
+ "fadd z17.s, z0.s, z23.s\n"
"fadd z16.s, z31.s, z30.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fadd z16.s, z17.s, z16.s\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fadd z5.s, z5.s, z16.s\n"
"add x24, x24, #0x20\n"
- "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "ld1w { z23.s }, p3/Z, [x22, x9, LSL #2]\n"
"ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
"ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fadd z4.s, z4.s, z16.s\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z17.s, z1.s, z0.s\n"
+ "fadd z17.s, z0.s, z23.s\n"
"fadd z16.s, z31.s, z30.s\n"
"fadd z16.s, z17.s, z16.s\n"
- "fadd z5.s, z5.s, z16.s\n"
+ "fadd z4.s, z4.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z5.s, z5.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fadd z4.s, z4.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z5.s, z5.s, z6.s\n"
- "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "fmul z4.s, z4.s, z5.s\n"
+ "st1w { z4.s }, p3, [%x[outptr], x9, LSL #2]\n"
"incw x9\n"
"whilelt p3.s, x9, %x[n_channels]\n"
"b.any 8b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 3c7213a498..e460009bdf 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,26 +66,26 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"mov x15, #0x0\n"
- "ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ptrue p2.b\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p0.s, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
+ "whilelt p0.s, x15, x13\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
"ldr x20, [x20, #0x40]\n"
+ "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
"ld1w { z26.s }, p0/Z, [x28, x15, LSL #2]\n"
"ld1w { z25.s }, p0/Z, [x26, x15, LSL #2]\n"
"ld1w { z24.s }, p0/Z, [x23, x15, LSL #2]\n"
- "ld1w { z19.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x15, LSL #2]\n"
"ld1w { z23.s }, p0/Z, [x20, x15, LSL #2]\n"
"incw x15\n"
"whilelt p1.s, x15, x13\n"
@@ -98,24 +98,24 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
"movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
"ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
- "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z20.s\n"
"movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
"ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
"ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
"movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
"movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
- "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
"movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
"movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
- "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
- "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
- "ld1w { z19.s }, p1/Z, [x22, x15, LSL #2]\n"
- "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x22, x15, LSL #2]\n"
"ld1w { z23.s }, p1/Z, [x20, x15, LSL #2]\n"
"incw x15\n"
"whilelt p1.s, x15, x13\n"
+ "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
"incw x14\n"
"b.any 1b\n"
@@ -123,15 +123,15 @@ void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
"movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
"whilelt p0.s, x14, x13\n"
- "movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
- "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
- "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z19.s\n"
- "movprfx z19, z24\n fmax z19.s, p2/M, z19.s, z23.s\n"
- "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "fmax z18.s, p2/M, z18.s, z22.s\n"
- "st1w { z16.s }, p0, [x12, x14, LSL #2]\n"
- "fmax z17.s, p2/M, z17.s, z21.s\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
+ "movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
"st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
"st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
"st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 0dabc2f292..6d2641b035 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,21 +53,21 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.s, #0xff800000\n"
"mov z4.s, #0xff800000\n"
- "mov z3.s, #0xff800000\n"
"mov x24, %x[inptrs]\n"
+ "mov z3.s, #0xff800000\n"
"mov z2.s, #0xff800000\n"
- "mov z1.s, #0xff800000\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x22, x9, LSL #2]\n"
"ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
- "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x23, x28, LSL #2]\n"
"ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
"ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
"ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
@@ -81,34 +81,34 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
- "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "movprfx z19, z1\n fmax z19.s, p0/M, z19.s, z0.s\n"
+ "fmax z23.s, p0/M, z23.s, z31.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "movprfx z18, z30\n fmax z18.s, p0/M, z18.s, z29.s\n"
"fmax z22.s, p0/M, z22.s, z28.s\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
"fmax z17.s, p0/M, z17.s, z27.s\n"
"fmax z21.s, p0/M, z21.s, z26.s\n"
- "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
"fmax z16.s, p0/M, z16.s, z25.s\n"
"fmax z20.s, p0/M, z20.s, z24.s\n"
- "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x23, x9, LSL #2]\n"
"fmax z19.s, p0/M, z19.s, z23.s\n"
+ "ld1w { z0.s }, p4/Z, [x22, x9, LSL #2]\n"
"fmax z18.s, p0/M, z18.s, z22.s\n"
"ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
"fmax z17.s, p0/M, z17.s, z21.s\n"
+ "ld1w { z31.s }, p4/Z, [x20, x9, LSL #2]\n"
"fmax z16.s, p0/M, z16.s, z20.s\n"
- "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
- "fmax z4.s, p0/M, z4.s, z19.s\n"
- "fmax z3.s, p0/M, z3.s, z18.s\n"
- "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z2.s, p0/M, z2.s, z17.s\n"
- "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "ld1w { z30.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z19.s\n"
"ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "fmax z4.s, p0/M, z4.s, z18.s\n"
"ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "fmax z3.s, p0/M, z3.s, z17.s\n"
"ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
"ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
"ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
"ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
@@ -119,9 +119,9 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
- "fmax z23.s, p0/M, z23.s, z30.s\n"
- "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "movprfx z19, z1\n fmax z19.s, p0/M, z19.s, z0.s\n"
+ "fmax z23.s, p0/M, z23.s, z31.s\n"
+ "movprfx z18, z30\n fmax z18.s, p0/M, z18.s, z29.s\n"
"fmax z22.s, p0/M, z22.s, z28.s\n"
"fmax z17.s, p0/M, z17.s, z27.s\n"
"fmax z21.s, p0/M, z21.s, z26.s\n"
@@ -131,33 +131,33 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"fmax z18.s, p0/M, z18.s, z22.s\n"
"fmax z17.s, p0/M, z17.s, z21.s\n"
"fmax z16.s, p0/M, z16.s, z20.s\n"
- "fmax z4.s, p0/M, z4.s, z19.s\n"
- "fmax z3.s, p0/M, z3.s, z18.s\n"
- "fmax z2.s, p0/M, z2.s, z17.s\n"
- "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "fmax z5.s, p0/M, z5.s, z19.s\n"
+ "fmax z4.s, p0/M, z4.s, z18.s\n"
+ "fmax z3.s, p0/M, z3.s, z17.s\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.s, p0/M, z4.s, z16.s\n"
- "ld1w { z16.s }, p3/Z, [x20, x28, LSL #2]\n"
- "fmax z3.s, p0/M, z3.s, z16.s\n"
- "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
- "fmax z2.s, p0/M, z2.s, z16.s\n"
+ "ld1w { z19.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x27, LSL #2]\n"
"ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "fmax z5.s, p0/M, z5.s, z19.s\n"
+ "fmax z4.s, p0/M, z4.s, z18.s\n"
+ "fmax z3.s, p0/M, z3.s, z17.s\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "st1w { z5.s }, p4, [%x[outptr], x9, LSL #2]\n"
"incw x9, ALL, MUL #4\n"
- "st1w { z3.s }, p3, [%x[outptr], x28, LSL #2]\n"
+ "st1w { z4.s }, p3, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
- "st1w { z2.s }, p2, [%x[outptr], x27, LSL #2]\n"
+ "st1w { z3.s }, p2, [%x[outptr], x27, LSL #2]\n"
"incw x27, ALL, MUL #4\n"
- "st1w { z1.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "st1w { z2.s }, p1, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
"whilelt p1.s, x26, %x[n_channels]\n"
"b.any 1b\n"
@@ -166,48 +166,48 @@ void sme_fp32_nhwc_max_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z4.s, #0xff800000\n"
+ "mov z5.s, #0xff800000\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1w { z0.s }, p4/Z, [x20, x9, LSL #2]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z0.s }, p4/Z, [x22, x9, LSL #2]\n"
"ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
- "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "movprfx z16, z1\n fmax z16.s, p0/M, z16.s, z0.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z31.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "fmax z16.s, p0/M, z16.s, z17.s\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fmax z4.s, p0/M, z4.s, z16.s\n"
"add x24, x24, #0x20\n"
- "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "ld1w { z0.s }, p4/Z, [x22, x9, LSL #2]\n"
"ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
- "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "movprfx z16, z1\n fmax z16.s, p0/M, z16.s, z0.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z31.s\n"
"fmax z16.s, p0/M, z16.s, z17.s\n"
- "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "st1w { z5.s }, p4, [%x[outptr], x9, LSL #2]\n"
"incw x9\n"
"whilelt p4.s, x9, %x[n_channels]\n"
"b.any 8b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
index c24e977dc6..b931767710 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -146,32 +146,32 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 2b\n"
@@ -205,17 +205,17 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z16.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
- ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z19.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
"ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a277 // sshllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508a676 // sshllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508a255 // sshllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508a654 // sshllt z20.h, z18.b, #0x0\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
@@ -236,25 +236,25 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "mov z17.s, #0x7f\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
- ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
- ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
- ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
- ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
- ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
- ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
- ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
- ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
- ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
- ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
- ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- "mov z19.s, #0x7f\n"
+ ".inst 0x04b275ef // sqdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x04b2754a // sqdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x04b27508 // sqdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x04b274e7 // sqdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x04b274c6 // sqdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x04b274a5 // sqdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x04b27484 // sqdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x04b27463 // sqdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x04b27442 // sqdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x04b27421 // sqdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27400 // sqdmulh z0.s, z0.s, z18.s\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
@@ -271,7 +271,7 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
- "not z16.s, p0/M, z19.s\n"
+ "not z16.s, p0/M, z17.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
@@ -288,36 +288,36 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"smax z2.s, p0/M, z2.s, z16.s\n"
"smax z1.s, p0/M, z1.s, z16.s\n"
"smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z17.s\n"
+ "smin z14.s, p0/M, z14.s, z17.s\n"
+ "smin z13.s, p0/M, z13.s, z17.s\n"
+ "smin z12.s, p0/M, z12.s, z17.s\n"
+ "smin z11.s, p0/M, z11.s, z17.s\n"
+ "smin z10.s, p0/M, z10.s, z17.s\n"
+ "smin z9.s, p0/M, z9.s, z17.s\n"
+ "smin z8.s, p0/M, z8.s, z17.s\n"
+ "smin z7.s, p0/M, z7.s, z17.s\n"
+ "smin z6.s, p0/M, z6.s, z17.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z5.s, p0/M, z5.s, z17.s\n"
+ "smin z4.s, p0/M, z4.s, z17.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p0/M, z11.s, z19.s\n"
- "smin z10.s, p0/M, z10.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z17.s\n"
+ "smin z2.s, p0/M, z2.s, z17.s\n"
"trn1 z22.h, z11.h, z10.h\n"
- "smin z9.s, p0/M, z9.s, z19.s\n"
- "smin z8.s, p0/M, z8.s, z19.s\n"
+ "smin z1.s, p0/M, z1.s, z17.s\n"
+ "smin z0.s, p0/M, z0.s, z17.s\n"
"trn1 z18.h, z9.h, z8.h\n"
- "smin z7.s, p0/M, z7.s, z19.s\n"
- "smin z6.s, p0/M, z6.s, z19.s\n"
"trn1 z21.h, z7.h, z6.h\n"
- "smin z5.s, p0/M, z5.s, z19.s\n"
- "smin z4.s, p0/M, z4.s, z19.s\n"
"trn1 z17.h, z5.h, z4.h\n"
- "smin z3.s, p0/M, z3.s, z19.s\n"
- "smin z2.s, p0/M, z2.s, z19.s\n"
- "trn1 z20.h, z3.h, z2.h\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z19.h, z1.h, z0.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z20.b, z19.b, z16.b\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z16.h, z1.h, z0.h\n"
"trn1 z18.b, z22.b, z18.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "incb x27, ALL, MUL #4\n"
"trn1 z17.b, z21.b, z17.b\n"
- "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z20.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z16.b, z19.b, z16.b\n"
"st1b { z18.b }, p3, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
"st1b { z17.b }, p2, [%x[outptr], x25]\n"
@@ -348,13 +348,13 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
"add x22, x22, #0x10\n"
"ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
@@ -368,10 +368,10 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x21, x21, #0x1\n"
"ld1b { z16.b }, p4/Z, [x20, x27]\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
@@ -379,26 +379,26 @@ void sme_s8_nhwc_avg_generic_depthfirst_impl(
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "mov z18.s, #0x7f\n"
+ "ld1rw { z17.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
- "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
"not z16.s, p0/M, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
"smax z12.s, p0/M, z12.s, z16.s\n"
"smin z15.s, p0/M, z15.s, z18.s\n"
"smin z14.s, p0/M, z14.s, z18.s\n"
- "trn1 z17.h, z15.h, z14.h\n"
"smin z13.s, p0/M, z13.s, z18.s\n"
"smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 96617566a8..f139b834c6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,26 +66,26 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"mov x15, #0x0\n"
- "ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ptrue p2.b\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p0.b, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "whilelt p0.b, x15, x13\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1b { z29.b }, p0/Z, [x25, x15]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1b { z28.b }, p0/Z, [x24, x15]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
"ldr x20, [x20, #0x40]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
"ld1b { z26.b }, p0/Z, [x28, x15]\n"
"ld1b { z25.b }, p0/Z, [x26, x15]\n"
"ld1b { z24.b }, p0/Z, [x23, x15]\n"
- "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x15]\n"
"ld1b { z23.b }, p0/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
@@ -98,24 +98,24 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
"movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
"ld1b { z28.b }, p1/Z, [x24, x15]\n"
- "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z20.b\n"
"movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z27.b }, p1/Z, [x21, x15]\n"
"ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
"movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
"movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
- "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
"movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
"movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "ld1b { z25.b }, p1/Z, [x26, x15]\n"
- "st1b { z19.b }, p0, [x12, x14]\n"
- "ld1b { z24.b }, p1/Z, [x23, x15]\n"
- "st1b { z18.b }, p0, [x11, x14]\n"
- "ld1b { z19.b }, p1/Z, [x22, x15]\n"
- "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z20.b }, p1/Z, [x22, x15]\n"
"ld1b { z23.b }, p1/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
"incw x14\n"
"b.any 1b\n"
@@ -123,15 +123,15 @@ void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
"movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
"whilelt p0.b, x14, x13\n"
- "movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
- "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
- "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z19.b\n"
- "movprfx z19, z24\n smax z19.b, p2/M, z19.b, z23.b\n"
- "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
- "smax z18.b, p2/M, z18.b, z22.b\n"
- "st1b { z16.b }, p0, [x12, x14]\n"
- "smax z17.b, p2/M, z17.b, z21.b\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
"st1b { z18.b }, p0, [x11, x14]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
index d2b45cd353..5cf60e9315 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,21 +53,21 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x80\n"
"mov z4.b, #0x80\n"
- "mov z3.b, #0x80\n"
"mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x80\n"
"mov z2.b, #0x80\n"
- "mov z1.b, #0x80\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
@@ -81,34 +81,34 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
+ "movprfx z19, z1\n smax z19.b, p0/M, z19.b, z0.b\n"
+ "smax z23.b, p0/M, z23.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z18, z30\n smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
+ "smax z5.b, p0/M, z5.b, z19.b\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "smax z4.b, p0/M, z4.b, z18.b\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "smax z3.b, p0/M, z3.b, z17.b\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
"ld1b { z17.b }, p2/Z, [x23, x27]\n"
"ld1b { z27.b }, p2/Z, [x22, x27]\n"
"ld1b { z21.b }, p2/Z, [x21, x27]\n"
@@ -119,9 +119,9 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "smax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z19, z1\n smax z19.b, p0/M, z19.b, z0.b\n"
+ "smax z23.b, p0/M, z23.b, z31.b\n"
+ "movprfx z18, z30\n smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
@@ -131,33 +131,33 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"smax z18.b, p0/M, z18.b, z22.b\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "smax z1.b, p0/M, z1.b, z16.b\n"
+ "smax z5.b, p0/M, z5.b, z19.b\n"
+ "smax z4.b, p0/M, z4.b, z18.b\n"
+ "smax z3.b, p0/M, z3.b, z17.b\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p3/Z, [x20, x28]\n"
- "smax z3.b, p0/M, z3.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z19.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x27]\n"
"ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "smax z1.b, p0/M, z1.b, z16.b\n"
+ "smax z5.b, p0/M, z5.b, z19.b\n"
+ "smax z4.b, p0/M, z4.b, z18.b\n"
+ "smax z3.b, p0/M, z3.b, z17.b\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z5.b }, p4, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
- "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "st1b { z4.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
- "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "st1b { z3.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
- "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "st1b { z2.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
"whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
@@ -166,48 +166,48 @@ void sme_s8_nhwc_max_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z4.b, #0x80\n"
+ "mov z5.b, #0x80\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x20, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n smax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z16.b, p0/M, z16.b, z17.b\n"
"ldp x21, x20, [x24, #0x10]\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n smax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z31.b\n"
"smax z16.b, p0/M, z16.b, z17.b\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z5.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
"whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 91f2f7ab31..c4a6290dac 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -165,32 +165,32 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 2b\n"
@@ -224,17 +224,17 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z16.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
- ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z19.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
"ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a277 // sshllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508a676 // sshllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508a255 // sshllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508a654 // sshllt z20.h, z18.b, #0x0\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
@@ -255,25 +255,26 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
+ "mov z18.s, #0x7f\n"
"ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
- ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
"ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
- ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
- ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
- ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
- ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
- ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
- ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
- ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
- ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
- ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
+ ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
+ ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
+ ".inst 0x4482826b // srshl z11.s, p0/M, z11.s, z19.s\n"
+ ".inst 0x4482826a // srshl z10.s, p0/M, z10.s, z19.s\n"
+ ".inst 0x44828269 // srshl z9.s, p0/M, z9.s, z19.s\n"
+ ".inst 0x44828268 // srshl z8.s, p0/M, z8.s, z19.s\n"
+ ".inst 0x44828267 // srshl z7.s, p0/M, z7.s, z19.s\n"
+ ".inst 0x44828266 // srshl z6.s, p0/M, z6.s, z19.s\n"
+ ".inst 0x44828265 // srshl z5.s, p0/M, z5.s, z19.s\n"
+ ".inst 0x44828264 // srshl z4.s, p0/M, z4.s, z19.s\n"
+ ".inst 0x44828263 // srshl z3.s, p0/M, z3.s, z19.s\n"
+ ".inst 0x44828262 // srshl z2.s, p0/M, z2.s, z19.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
@@ -290,7 +291,6 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
- "mov z19.s, #0x7f\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
@@ -307,7 +307,7 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
- "not z16.s, p0/M, z19.s\n"
+ "not z16.s, p0/M, z18.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
@@ -324,36 +324,36 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"smax z2.s, p0/M, z2.s, z16.s\n"
"smax z1.s, p0/M, z1.s, z16.s\n"
"smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p0/M, z11.s, z19.s\n"
- "smin z10.s, p0/M, z10.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z22.h, z11.h, z10.h\n"
- "smin z9.s, p0/M, z9.s, z19.s\n"
- "smin z8.s, p0/M, z8.s, z19.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z18.h, z9.h, z8.h\n"
- "smin z7.s, p0/M, z7.s, z19.s\n"
- "smin z6.s, p0/M, z6.s, z19.s\n"
"trn1 z21.h, z7.h, z6.h\n"
- "smin z5.s, p0/M, z5.s, z19.s\n"
- "smin z4.s, p0/M, z4.s, z19.s\n"
"trn1 z17.h, z5.h, z4.h\n"
- "smin z3.s, p0/M, z3.s, z19.s\n"
- "smin z2.s, p0/M, z2.s, z19.s\n"
- "trn1 z20.h, z3.h, z2.h\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z19.h, z1.h, z0.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z20.b, z19.b, z16.b\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z16.h, z1.h, z0.h\n"
"trn1 z18.b, z22.b, z18.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "incb x27, ALL, MUL #4\n"
"trn1 z17.b, z21.b, z17.b\n"
- "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z20.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z16.b, z19.b, z16.b\n"
"st1b { z18.b }, p3, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
"st1b { z17.b }, p2, [%x[outptr], x25]\n"
@@ -384,13 +384,13 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
"add x22, x22, #0x10\n"
"ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
@@ -404,10 +404,10 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x21, x21, #0x1\n"
"ld1b { z16.b }, p4/Z, [x20, x27]\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
@@ -415,31 +415,31 @@ void sme_s8q_nhwc_avg_generic_depthfirst_impl(
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
"ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ "mov z19.s, #0x7f\n"
+ "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
- "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"smax z15.s, p0/M, z15.s, z16.s\n"
"smax z14.s, p0/M, z14.s, z16.s\n"
"smax z13.s, p0/M, z13.s, z16.s\n"
"smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
index e9b586f4ce..6895fd2011 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,21 +55,21 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z4.b, #0x80\n"
+ "mov z5.b, #0x80\n"
"mov z3.b, #0x80\n"
"mov x24, %x[inptrs]\n"
"mov z2.b, #0x80\n"
- "mov z1.b, #0x80\n"
+ "mov z4.b, #0x80\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
@@ -83,34 +83,34 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
+ "movprfx z19, z1\n smax z19.b, p0/M, z19.b, z0.b\n"
+ "smax z23.b, p0/M, z23.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z18, z30\n smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"smax z16.b, p0/M, z16.b, z25.b\n"
"smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
"smax z19.b, p0/M, z19.b, z23.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"smax z18.b, p0/M, z18.b, z22.b\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
- "smax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
- "smax z2.b, p0/M, z2.b, z17.b\n"
- "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
+ "smax z5.b, p0/M, z5.b, z19.b\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"ld1b { z17.b }, p2/Z, [x23, x27]\n"
"ld1b { z27.b }, p2/Z, [x22, x27]\n"
"ld1b { z21.b }, p2/Z, [x21, x27]\n"
@@ -121,9 +121,9 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
- "smax z23.b, p0/M, z23.b, z30.b\n"
- "smax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z19, z1\n smax z19.b, p0/M, z19.b, z0.b\n"
+ "smax z23.b, p0/M, z23.b, z31.b\n"
+ "movprfx z18, z30\n smax z18.b, p0/M, z18.b, z29.b\n"
"smax z22.b, p0/M, z22.b, z28.b\n"
"smax z17.b, p0/M, z17.b, z27.b\n"
"smax z21.b, p0/M, z21.b, z26.b\n"
@@ -133,108 +133,108 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"smax z18.b, p0/M, z18.b, z22.b\n"
"smax z17.b, p0/M, z17.b, z21.b\n"
"smax z16.b, p0/M, z16.b, z20.b\n"
- "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z5.b, p0/M, z5.b, z19.b\n"
"smax z3.b, p0/M, z3.b, z18.b\n"
"smax z2.b, p0/M, z2.b, z17.b\n"
- "smax z1.b, p0/M, z1.b, z16.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p3/Z, [x20, x28]\n"
- "smax z3.b, p0/M, z3.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z19.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x27]\n"
"ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "smax z1.b, p0/M, z1.b, z16.b\n"
+ "smax z5.b, p0/M, z5.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
- ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
+ ".inst 0x4508a0b3 // sshllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508a4b8 // sshllt z24.h, z5.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a075 // sshllb z21.h, z3.b, #0x0\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ ".inst 0x4508a076 // sshllb z22.h, z3.b, #0x0\n"
".inst 0x4508a472 // sshllt z18.h, z3.b, #0x0\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4508a054 // sshllb z20.h, z2.b, #0x0\n"
- ".inst 0x4508a451 // sshllt z17.h, z2.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x4508a033 // sshllb z19.h, z1.b, #0x0\n"
- ".inst 0x4508a430 // sshllt z16.h, z1.b, #0x0\n"
- ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
- ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
- ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
- ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
- ".inst 0x4510a2be // sshllb z30.s, z21.h, #0x0\n"
- ".inst 0x4510a6b6 // sshllt z22.s, z21.h, #0x0\n"
+ ".inst 0x4508a055 // sshllb z21.h, z2.b, #0x0\n"
+ ".inst 0x4508a454 // sshllt z20.h, z2.b, #0x0\n"
+ "ld1rw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
+ ".inst 0x4508a491 // sshllt z17.h, z4.b, #0x0\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a261 // sshllb z1.s, z19.h, #0x0\n"
+ ".inst 0x4510a673 // sshllt z19.s, z19.h, #0x0\n"
+ ".inst 0x4510a300 // sshllb z0.s, z24.h, #0x0\n"
+ ".inst 0x4510a71f // sshllt z31.s, z24.h, #0x0\n"
+ ".inst 0x4510a2de // sshllb z30.s, z22.h, #0x0\n"
+ ".inst 0x4510a6d6 // sshllt z22.s, z22.h, #0x0\n"
".inst 0x4510a25d // sshllb z29.s, z18.h, #0x0\n"
".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
- ".inst 0x4510a29c // sshllb z28.s, z20.h, #0x0\n"
- ".inst 0x4510a695 // sshllt z21.s, z20.h, #0x0\n"
- ".inst 0x4510a23b // sshllb z27.s, z17.h, #0x0\n"
- ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a27a // sshllb z26.s, z19.h, #0x0\n"
- ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
- ".inst 0x4510a219 // sshllb z25.s, z16.h, #0x0\n"
- ".inst 0x4510a618 // sshllt z24.s, z16.h, #0x0\n"
- ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
- ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
- ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
- ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
- ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
- ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
- ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
- ".inst 0x44828092 // srshl z18.s, p0/M, z18.s, z4.s\n"
- ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
- ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
- ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
- ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
- ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
- ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
- ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
- ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
- ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
- ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
- ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
- ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
- ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
- ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- "mov z19.s, #0x7f\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
- ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
- ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
- ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
- ".inst 0x44828052 // srshl z18.s, p0/M, z18.s, z2.s\n"
- ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
- ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
- ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
- ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
- ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
- ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
- ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
- ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
- "not z16.s, p0/M, z19.s\n"
+ ".inst 0x4510a2bc // sshllb z28.s, z21.h, #0x0\n"
+ ".inst 0x4510a6b5 // sshllt z21.s, z21.h, #0x0\n"
+ ".inst 0x4510a29b // sshllb z27.s, z20.h, #0x0\n"
+ ".inst 0x4510a694 // sshllt z20.s, z20.h, #0x0\n"
+ ".inst 0x4510a2fa // sshllb z26.s, z23.h, #0x0\n"
+ ".inst 0x4510a6f9 // sshllt z25.s, z23.h, #0x0\n"
+ ".inst 0x4510a238 // sshllb z24.s, z17.h, #0x0\n"
+ ".inst 0x4510a637 // sshllt z23.s, z17.h, #0x0\n"
+ ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
+ ".inst 0x44828073 // srshl z19.s, p0/M, z19.s, z3.s\n"
+ ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
+ ".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
+ ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
+ ".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
+ ".inst 0x44828072 // srshl z18.s, p0/M, z18.s, z3.s\n"
+ ".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
+ ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
+ ".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
+ ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
+ ".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
+ ".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
+ ".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
+ ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ ".inst 0x04a27673 // sqrdmulh z19.s, z19.s, z2.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ ".inst 0x04a277bd // sqrdmulh z29.s, z29.s, z2.s\n"
+ ".inst 0x04a27652 // sqrdmulh z18.s, z18.s, z2.s\n"
+ ".inst 0x04a2779c // sqrdmulh z28.s, z28.s, z2.s\n"
+ ".inst 0x04a276b5 // sqrdmulh z21.s, z21.s, z2.s\n"
+ ".inst 0x04a2777b // sqrdmulh z27.s, z27.s, z2.s\n"
+ ".inst 0x04a27694 // sqrdmulh z20.s, z20.s, z2.s\n"
+ ".inst 0x04a2775a // sqrdmulh z26.s, z26.s, z2.s\n"
+ ".inst 0x04a27739 // sqrdmulh z25.s, z25.s, z2.s\n"
+ ".inst 0x04a27718 // sqrdmulh z24.s, z24.s, z2.s\n"
+ ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
+ "mov z17.s, #0x7f\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
+ ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
+ ".inst 0x44828212 // srshl z18.s, p0/M, z18.s, z16.s\n"
+ ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
+ ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
+ ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
+ ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
+ "not z16.s, p0/M, z17.s\n"
"smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
"smax z0.s, p0/M, z0.s, z16.s\n"
"smax z31.s, p0/M, z31.s, z16.s\n"
"smax z30.s, p0/M, z30.s, z16.s\n"
@@ -244,41 +244,41 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"smax z28.s, p0/M, z28.s, z16.s\n"
"smax z21.s, p0/M, z21.s, z16.s\n"
"smax z27.s, p0/M, z27.s, z16.s\n"
- "smax z17.s, p0/M, z17.s, z16.s\n"
- "smax z26.s, p0/M, z26.s, z16.s\n"
"smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
"smax z25.s, p0/M, z25.s, z16.s\n"
"smax z24.s, p0/M, z24.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z23.s, p0/M, z23.s, z19.s\n"
- "trn1 z23.h, z1.h, z23.h\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "smin z31.s, p0/M, z31.s, z19.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z17.s\n"
+ "smin z19.s, p0/M, z19.s, z17.s\n"
+ "smin z0.s, p0/M, z0.s, z17.s\n"
+ "smin z31.s, p0/M, z31.s, z17.s\n"
+ "smin z30.s, p0/M, z30.s, z17.s\n"
+ "smin z22.s, p0/M, z22.s, z17.s\n"
+ "smin z29.s, p0/M, z29.s, z17.s\n"
+ "smin z18.s, p0/M, z18.s, z17.s\n"
+ "smin z28.s, p0/M, z28.s, z17.s\n"
+ "smin z21.s, p0/M, z21.s, z17.s\n"
+ "trn1 z19.h, z1.h, z19.h\n"
+ "smin z27.s, p0/M, z27.s, z17.s\n"
+ "smin z20.s, p0/M, z20.s, z17.s\n"
"trn1 z16.h, z0.h, z31.h\n"
- "smin z30.s, p0/M, z30.s, z19.s\n"
- "smin z22.s, p0/M, z22.s, z19.s\n"
+ "smin z26.s, p0/M, z26.s, z17.s\n"
+ "smin z25.s, p0/M, z25.s, z17.s\n"
"trn1 z22.h, z30.h, z22.h\n"
- "smin z29.s, p0/M, z29.s, z19.s\n"
- "smin z18.s, p0/M, z18.s, z19.s\n"
+ "smin z24.s, p0/M, z24.s, z17.s\n"
+ "smin z23.s, p0/M, z23.s, z17.s\n"
"trn1 z18.h, z29.h, z18.h\n"
- "smin z28.s, p0/M, z28.s, z19.s\n"
- "smin z21.s, p0/M, z21.s, z19.s\n"
"trn1 z21.h, z28.h, z21.h\n"
- "smin z27.s, p0/M, z27.s, z19.s\n"
- "smin z17.s, p0/M, z17.s, z19.s\n"
- "trn1 z17.h, z27.h, z17.h\n"
- "smin z26.s, p0/M, z26.s, z19.s\n"
- "smin z20.s, p0/M, z20.s, z19.s\n"
- "trn1 z20.h, z26.h, z20.h\n"
- "smin z25.s, p0/M, z25.s, z19.s\n"
- "smin z24.s, p0/M, z24.s, z19.s\n"
- "trn1 z19.h, z25.h, z24.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z17.h, z27.h, z20.h\n"
+ "trn1 z20.b, z19.b, z16.b\n"
+ "trn1 z19.h, z26.h, z25.h\n"
+ "trn1 z16.h, z24.h, z23.h\n"
"trn1 z18.b, z22.b, z18.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x9]\n"
- "incb x9, ALL, MUL #4\n"
"trn1 z17.b, z21.b, z17.b\n"
- "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z20.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z16.b, z19.b, z16.b\n"
"st1b { z18.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z17.b }, p2, [%x[outptr], x27]\n"
@@ -292,83 +292,83 @@ void sme_s8q_nhwc_max_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z4.b, #0x80\n"
+ "mov z5.b, #0x80\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x20, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n smax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "smax z16.b, p0/M, z16.b, z17.b\n"
"ldp x21, x20, [x24, #0x10]\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n smax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z31.b\n"
"smax z16.b, p0/M, z16.b, z17.b\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- ".inst 0x4508a091 // sshllb z17.h, z4.b, #0x0\n"
- ".inst 0x4508a490 // sshllt z16.h, z4.b, #0x0\n"
+ ".inst 0x4508a0b1 // sshllb z17.h, z5.b, #0x0\n"
+ ".inst 0x4508a4b0 // sshllt z16.h, z5.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z18.s }, p0/Z, [x20]\n"
- ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
- ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
- ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z24.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x44828256 // srshl z22.s, p0/M, z22.s, z18.s\n"
- ".inst 0x44828255 // srshl z21.s, p0/M, z21.s, z18.s\n"
- ".inst 0x44828254 // srshl z20.s, p0/M, z20.s, z18.s\n"
- ".inst 0x44828253 // srshl z19.s, p0/M, z19.s, z18.s\n"
- ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
- ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
- ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
- ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
- ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
- ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
- ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z22.s, p0/M, z22.s, z16.s\n"
- "smax z21.s, p0/M, z21.s, z16.s\n"
+ "mov z23.s, #0x7f\n"
+ "ld1rw { z22.s }, p0/Z, [x21]\n"
+ "ld1rw { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a234 // sshllb z20.s, z17.h, #0x0\n"
+ ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ ".inst 0x4510a213 // sshllb z19.s, z16.h, #0x0\n"
+ ".inst 0x4510a612 // sshllt z18.s, z16.h, #0x0\n"
+ "not z16.s, p0/M, z23.s\n"
+ ".inst 0x44828314 // srshl z20.s, p0/M, z20.s, z24.s\n"
+ ".inst 0x44828311 // srshl z17.s, p0/M, z17.s, z24.s\n"
+ ".inst 0x44828313 // srshl z19.s, p0/M, z19.s, z24.s\n"
+ ".inst 0x44828312 // srshl z18.s, p0/M, z18.s, z24.s\n"
+ ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n"
+ ".inst 0x04b67652 // sqrdmulh z18.s, z18.s, z22.s\n"
+ ".inst 0x448282b4 // srshl z20.s, p0/M, z20.s, z21.s\n"
+ ".inst 0x448282b1 // srshl z17.s, p0/M, z17.s, z21.s\n"
+ ".inst 0x448282b3 // srshl z19.s, p0/M, z19.s, z21.s\n"
+ ".inst 0x448282b2 // srshl z18.s, p0/M, z18.s, z21.s\n"
"smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
"smax z19.s, p0/M, z19.s, z16.s\n"
- "smin z22.s, p0/M, z22.s, z18.s\n"
- "smin z21.s, p0/M, z21.s, z18.s\n"
- "trn1 z17.h, z22.h, z21.h\n"
- "smin z20.s, p0/M, z20.s, z18.s\n"
- "smin z19.s, p0/M, z19.s, z18.s\n"
- "trn1 z16.h, z20.h, z19.h\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z23.s\n"
+ "smin z17.s, p0/M, z17.s, z23.s\n"
+ "smin z19.s, p0/M, z19.s, z23.s\n"
+ "smin z18.s, p0/M, z18.s, z23.s\n"
+ "trn1 z17.h, z20.h, z17.h\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
index f0e7bbf5cc..0aa6fc8881 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -146,32 +146,32 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 2b\n"
@@ -205,17 +205,17 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z16.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
- ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z19.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
"ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa77 // ushllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508ae76 // ushllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508aa55 // ushllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508ae54 // ushllt z20.h, z18.b, #0x0\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
@@ -236,24 +236,26 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z19.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "mov z18.s, #0x0\n"
+ "mov z17.s, #0xff\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
- ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
- ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
- ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
- ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
- ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
- ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
- ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
- ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
- ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
- ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
- ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ ".inst 0x04b375ef // sqdmulh z15.s, z15.s, z19.s\n"
+ ".inst 0x04b375ce // sqdmulh z14.s, z14.s, z19.s\n"
+ ".inst 0x04b375ad // sqdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x04b3758c // sqdmulh z12.s, z12.s, z19.s\n"
+ ".inst 0x04b3756b // sqdmulh z11.s, z11.s, z19.s\n"
+ ".inst 0x04b3754a // sqdmulh z10.s, z10.s, z19.s\n"
+ ".inst 0x04b37529 // sqdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x04b37508 // sqdmulh z8.s, z8.s, z19.s\n"
+ ".inst 0x04b374e7 // sqdmulh z7.s, z7.s, z19.s\n"
+ ".inst 0x04b374c6 // sqdmulh z6.s, z6.s, z19.s\n"
+ ".inst 0x04b374a5 // sqdmulh z5.s, z5.s, z19.s\n"
+ ".inst 0x04b37484 // sqdmulh z4.s, z4.s, z19.s\n"
+ ".inst 0x04b37463 // sqdmulh z3.s, z3.s, z19.s\n"
+ ".inst 0x04b37442 // sqdmulh z2.s, z2.s, z19.s\n"
+ ".inst 0x04b37421 // sqdmulh z1.s, z1.s, z19.s\n"
+ ".inst 0x04b37400 // sqdmulh z0.s, z0.s, z19.s\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
@@ -270,54 +272,52 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smax z11.s, p0/M, z11.s, z16.s\n"
- "smax z10.s, p0/M, z10.s, z16.s\n"
- "smax z9.s, p0/M, z9.s, z16.s\n"
- "smax z8.s, p0/M, z8.s, z16.s\n"
- "smax z7.s, p0/M, z7.s, z16.s\n"
- "smax z6.s, p0/M, z6.s, z16.s\n"
- "smax z5.s, p0/M, z5.s, z16.s\n"
- "smax z4.s, p0/M, z4.s, z16.s\n"
- "smax z3.s, p0/M, z3.s, z16.s\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z18.s\n"
+ "smax z14.s, p0/M, z14.s, z18.s\n"
+ "smax z13.s, p0/M, z13.s, z18.s\n"
+ "smax z12.s, p0/M, z12.s, z18.s\n"
+ "smax z11.s, p0/M, z11.s, z18.s\n"
+ "smax z10.s, p0/M, z10.s, z18.s\n"
+ "smax z9.s, p0/M, z9.s, z18.s\n"
+ "smax z8.s, p0/M, z8.s, z18.s\n"
+ "smax z7.s, p0/M, z7.s, z18.s\n"
+ "smax z6.s, p0/M, z6.s, z18.s\n"
+ "smax z5.s, p0/M, z5.s, z18.s\n"
+ "smax z4.s, p0/M, z4.s, z18.s\n"
+ "smax z3.s, p0/M, z3.s, z18.s\n"
+ "smax z2.s, p0/M, z2.s, z18.s\n"
+ "smax z1.s, p0/M, z1.s, z18.s\n"
+ "smax z0.s, p0/M, z0.s, z18.s\n"
+ "smin z15.s, p0/M, z15.s, z17.s\n"
+ "smin z14.s, p0/M, z14.s, z17.s\n"
+ "smin z13.s, p0/M, z13.s, z17.s\n"
+ "smin z12.s, p0/M, z12.s, z17.s\n"
+ "smin z11.s, p0/M, z11.s, z17.s\n"
+ "smin z10.s, p0/M, z10.s, z17.s\n"
+ "smin z9.s, p0/M, z9.s, z17.s\n"
+ "smin z8.s, p0/M, z8.s, z17.s\n"
+ "smin z7.s, p0/M, z7.s, z17.s\n"
+ "smin z6.s, p0/M, z6.s, z17.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z5.s, p0/M, z5.s, z17.s\n"
+ "smin z4.s, p0/M, z4.s, z17.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p0/M, z11.s, z19.s\n"
- "smin z10.s, p0/M, z10.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z17.s\n"
+ "smin z2.s, p0/M, z2.s, z17.s\n"
"trn1 z22.h, z11.h, z10.h\n"
- "smin z9.s, p0/M, z9.s, z19.s\n"
- "smin z8.s, p0/M, z8.s, z19.s\n"
+ "smin z1.s, p0/M, z1.s, z17.s\n"
+ "smin z0.s, p0/M, z0.s, z17.s\n"
"trn1 z18.h, z9.h, z8.h\n"
- "smin z7.s, p0/M, z7.s, z19.s\n"
- "smin z6.s, p0/M, z6.s, z19.s\n"
"trn1 z21.h, z7.h, z6.h\n"
- "smin z5.s, p0/M, z5.s, z19.s\n"
- "smin z4.s, p0/M, z4.s, z19.s\n"
"trn1 z17.h, z5.h, z4.h\n"
- "smin z3.s, p0/M, z3.s, z19.s\n"
- "smin z2.s, p0/M, z2.s, z19.s\n"
- "trn1 z20.h, z3.h, z2.h\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z19.h, z1.h, z0.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z20.b, z19.b, z16.b\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z16.h, z1.h, z0.h\n"
"trn1 z18.b, z22.b, z18.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "incb x27, ALL, MUL #4\n"
"trn1 z17.b, z21.b, z17.b\n"
- "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z20.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z16.b, z19.b, z16.b\n"
"st1b { z18.b }, p3, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
"st1b { z17.b }, p2, [%x[outptr], x25]\n"
@@ -348,13 +348,13 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
"add x22, x22, #0x10\n"
"ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
@@ -368,37 +368,37 @@ void sme_u8_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x21, x21, #0x1\n"
"ld1b { z16.b }, p4/Z, [x20, x27]\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
- ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
- ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
- ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
- ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z19.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "mov z18.s, #0x0\n"
+ "mov z17.s, #0xff\n"
"ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b375ef // sqdmulh z15.s, z15.s, z19.s\n"
+ ".inst 0x04b375ce // sqdmulh z14.s, z14.s, z19.s\n"
+ ".inst 0x04b375ad // sqdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x04b3758c // sqdmulh z12.s, z12.s, z19.s\n"
".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z17.s\n"
- "smax z14.s, p0/M, z14.s, z17.s\n"
- "smax z13.s, p0/M, z13.s, z17.s\n"
- "smax z12.s, p0/M, z12.s, z17.s\n"
- "smin z15.s, p0/M, z15.s, z16.s\n"
- "smin z14.s, p0/M, z14.s, z16.s\n"
+ "smax z15.s, p0/M, z15.s, z18.s\n"
+ "smax z14.s, p0/M, z14.s, z18.s\n"
+ "smax z13.s, p0/M, z13.s, z18.s\n"
+ "smax z12.s, p0/M, z12.s, z18.s\n"
+ "smin z15.s, p0/M, z15.s, z17.s\n"
+ "smin z14.s, p0/M, z14.s, z17.s\n"
+ "smin z13.s, p0/M, z13.s, z17.s\n"
+ "smin z12.s, p0/M, z12.s, z17.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z16.s\n"
- "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 9088cbde89..393047c8bc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,26 +66,26 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
".inst 0xd503477f // SMSTART ZA\n"
"mov x15, #0x0\n"
- "ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x14, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ptrue p2.b\n"
"ldr x13, [%x[args], %[offsetof_n_channels]]\n"
- "whilelt p0.b, x15, x13\n"
"ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
"ldp x28, x27, [x20, #0x0]\n"
- "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "whilelt p0.b, x15, x13\n"
"ldp x26, x25, [x20, #0x10]\n"
- "ld1b { z29.b }, p0/Z, [x25, x15]\n"
"ldp x24, x23, [x20, #0x20]\n"
- "ld1b { z28.b }, p0/Z, [x24, x15]\n"
"ldp x22, x21, [x20, #0x30]\n"
- "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
"ldr x20, [x20, #0x40]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
"ld1b { z26.b }, p0/Z, [x28, x15]\n"
"ld1b { z25.b }, p0/Z, [x26, x15]\n"
"ld1b { z24.b }, p0/Z, [x23, x15]\n"
- "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x15]\n"
"ld1b { z23.b }, p0/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
@@ -98,24 +98,24 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
"movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
"ld1b { z28.b }, p1/Z, [x24, x15]\n"
- "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z20.b\n"
"movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z27.b }, p1/Z, [x21, x15]\n"
"ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
"movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
"movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
- "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
"movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
"movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "ld1b { z25.b }, p1/Z, [x26, x15]\n"
- "st1b { z19.b }, p0, [x12, x14]\n"
- "ld1b { z24.b }, p1/Z, [x23, x15]\n"
- "st1b { z18.b }, p0, [x11, x14]\n"
- "ld1b { z19.b }, p1/Z, [x22, x15]\n"
- "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z20.b }, p1/Z, [x22, x15]\n"
"ld1b { z23.b }, p1/Z, [x20, x15]\n"
"incw x15\n"
"whilelt p1.b, x15, x13\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
"incw x14\n"
"b.any 1b\n"
@@ -123,15 +123,15 @@ void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
"movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
"whilelt p0.b, x14, x13\n"
- "movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
- "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
- "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z19.b\n"
- "movprfx z19, z24\n umax z19.b, p2/M, z19.b, z23.b\n"
- "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
- "umax z18.b, p2/M, z18.b, z22.b\n"
- "st1b { z16.b }, p0, [x12, x14]\n"
- "umax z17.b, p2/M, z17.b, z21.b\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
"st1b { z18.b }, p0, [x11, x14]\n"
"st1b { z17.b }, p0, [x10, x14]\n"
"st1b { z16.b }, p0, [x9, x14]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
index 06f13e8111..8755113b9a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,21 +53,21 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
"mov z4.b, #0x0\n"
- "mov z3.b, #0x0\n"
"mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x0\n"
"mov z2.b, #0x0\n"
- "mov z1.b, #0x0\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
@@ -81,34 +81,34 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
+ "movprfx z19, z1\n umax z19.b, p0/M, z19.b, z0.b\n"
+ "umax z23.b, p0/M, z23.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z18, z30\n umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "umax z4.b, p0/M, z4.b, z19.b\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "umax z4.b, p0/M, z4.b, z18.b\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "umax z3.b, p0/M, z3.b, z17.b\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
"ld1b { z17.b }, p2/Z, [x23, x27]\n"
"ld1b { z27.b }, p2/Z, [x22, x27]\n"
"ld1b { z21.b }, p2/Z, [x21, x27]\n"
@@ -119,9 +119,9 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "umax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z19, z1\n umax z19.b, p0/M, z19.b, z0.b\n"
+ "umax z23.b, p0/M, z23.b, z31.b\n"
+ "movprfx z18, z30\n umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
@@ -131,33 +131,33 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"umax z18.b, p0/M, z18.b, z22.b\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "umax z4.b, p0/M, z4.b, z19.b\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "umax z1.b, p0/M, z1.b, z16.b\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z4.b, p0/M, z4.b, z18.b\n"
+ "umax z3.b, p0/M, z3.b, z17.b\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z4.b, p0/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p3/Z, [x20, x28]\n"
- "umax z3.b, p0/M, z3.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z19.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x27]\n"
"ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "umax z1.b, p0/M, z1.b, z16.b\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z4.b, p0/M, z4.b, z18.b\n"
+ "umax z3.b, p0/M, z3.b, z17.b\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z5.b }, p4, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
- "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "st1b { z4.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
- "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "st1b { z3.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
- "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "st1b { z2.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
"whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
@@ -166,48 +166,48 @@ void sme_u8_nhwc_max_generic_depthfirst_impl(
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z4.b, #0x0\n"
+ "mov z5.b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x20, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n umax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z16.b, p0/M, z16.b, z17.b\n"
"ldp x21, x20, [x24, #0x10]\n"
- "umax z4.b, p0/M, z4.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n umax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z31.b\n"
"umax z16.b, p0/M, z16.b, z17.b\n"
- "umax z4.b, p0/M, z4.b, z16.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z5.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
"whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 52c52ccdb9..d08863105b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -133,11 +133,11 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"1:" // 4-vectors of channels
"ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
- "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -170,32 +170,32 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 2b\n"
@@ -229,17 +229,17 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
"subs x21, x21, #0x1\n"
- "ld1b { z16.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
- ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z19.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
"ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa77 // ushllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508ae76 // ushllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508aa55 // ushllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508ae54 // ushllt z20.h, z18.b, #0x0\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
@@ -260,27 +260,29 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
- ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ "ld1rw { z21.s }, p0/Z, [%x[left_shift]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
- ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0xff\n"
"ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482826b // srshl z11.s, p0/M, z11.s, z19.s\n"
- ".inst 0x4482826a // srshl z10.s, p0/M, z10.s, z19.s\n"
"ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x44828269 // srshl z9.s, p0/M, z9.s, z19.s\n"
- ".inst 0x44828268 // srshl z8.s, p0/M, z8.s, z19.s\n"
+ ".inst 0x448282af // srshl z15.s, p0/M, z15.s, z21.s\n"
+ ".inst 0x448282ae // srshl z14.s, p0/M, z14.s, z21.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x44828267 // srshl z7.s, p0/M, z7.s, z19.s\n"
- ".inst 0x44828266 // srshl z6.s, p0/M, z6.s, z19.s\n"
- ".inst 0x44828265 // srshl z5.s, p0/M, z5.s, z19.s\n"
- ".inst 0x44828264 // srshl z4.s, p0/M, z4.s, z19.s\n"
- ".inst 0x44828263 // srshl z3.s, p0/M, z3.s, z19.s\n"
- ".inst 0x44828262 // srshl z2.s, p0/M, z2.s, z19.s\n"
- ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
- ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x448282ad // srshl z13.s, p0/M, z13.s, z21.s\n"
+ ".inst 0x448282ac // srshl z12.s, p0/M, z12.s, z21.s\n"
+ ".inst 0x448282ab // srshl z11.s, p0/M, z11.s, z21.s\n"
+ ".inst 0x448282aa // srshl z10.s, p0/M, z10.s, z21.s\n"
+ ".inst 0x448282a9 // srshl z9.s, p0/M, z9.s, z21.s\n"
+ ".inst 0x448282a8 // srshl z8.s, p0/M, z8.s, z21.s\n"
+ ".inst 0x448282a7 // srshl z7.s, p0/M, z7.s, z21.s\n"
+ ".inst 0x448282a6 // srshl z6.s, p0/M, z6.s, z21.s\n"
+ ".inst 0x448282a5 // srshl z5.s, p0/M, z5.s, z21.s\n"
+ ".inst 0x448282a4 // srshl z4.s, p0/M, z4.s, z21.s\n"
+ ".inst 0x448282a3 // srshl z3.s, p0/M, z3.s, z21.s\n"
+ ".inst 0x448282a2 // srshl z2.s, p0/M, z2.s, z21.s\n"
+ ".inst 0x448282a1 // srshl z1.s, p0/M, z1.s, z21.s\n"
+ ".inst 0x448282a0 // srshl z0.s, p0/M, z0.s, z21.s\n"
".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
@@ -329,54 +331,52 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
"add z0.s, z0.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smax z11.s, p0/M, z11.s, z16.s\n"
- "smax z10.s, p0/M, z10.s, z16.s\n"
- "smax z9.s, p0/M, z9.s, z16.s\n"
- "smax z8.s, p0/M, z8.s, z16.s\n"
- "smax z7.s, p0/M, z7.s, z16.s\n"
- "smax z6.s, p0/M, z6.s, z16.s\n"
- "smax z5.s, p0/M, z5.s, z16.s\n"
- "smax z4.s, p0/M, z4.s, z16.s\n"
- "smax z3.s, p0/M, z3.s, z16.s\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z19.s\n"
- "smin z14.s, p0/M, z14.s, z19.s\n"
- "trn1 z23.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z19.s\n"
- "smin z12.s, p0/M, z12.s, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z19.s\n"
+ "smax z14.s, p0/M, z14.s, z19.s\n"
+ "smax z13.s, p0/M, z13.s, z19.s\n"
+ "smax z12.s, p0/M, z12.s, z19.s\n"
+ "smax z11.s, p0/M, z11.s, z19.s\n"
+ "smax z10.s, p0/M, z10.s, z19.s\n"
+ "smax z9.s, p0/M, z9.s, z19.s\n"
+ "smax z8.s, p0/M, z8.s, z19.s\n"
+ "smax z7.s, p0/M, z7.s, z19.s\n"
+ "smax z6.s, p0/M, z6.s, z19.s\n"
+ "smax z5.s, p0/M, z5.s, z19.s\n"
+ "smax z4.s, p0/M, z4.s, z19.s\n"
+ "smax z3.s, p0/M, z3.s, z19.s\n"
+ "smax z2.s, p0/M, z2.s, z19.s\n"
+ "smax z1.s, p0/M, z1.s, z19.s\n"
+ "smax z0.s, p0/M, z0.s, z19.s\n"
+ "smin z15.s, p0/M, z15.s, z20.s\n"
+ "smin z14.s, p0/M, z14.s, z20.s\n"
+ "smin z13.s, p0/M, z13.s, z20.s\n"
+ "smin z12.s, p0/M, z12.s, z20.s\n"
+ "smin z11.s, p0/M, z11.s, z20.s\n"
+ "smin z10.s, p0/M, z10.s, z20.s\n"
+ "smin z9.s, p0/M, z9.s, z20.s\n"
+ "smin z8.s, p0/M, z8.s, z20.s\n"
+ "smin z7.s, p0/M, z7.s, z20.s\n"
+ "smin z6.s, p0/M, z6.s, z20.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z5.s, p0/M, z5.s, z20.s\n"
+ "smin z4.s, p0/M, z4.s, z20.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p0/M, z11.s, z19.s\n"
- "smin z10.s, p0/M, z10.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z20.s\n"
"trn1 z22.h, z11.h, z10.h\n"
- "smin z9.s, p0/M, z9.s, z19.s\n"
- "smin z8.s, p0/M, z8.s, z19.s\n"
+ "smin z1.s, p0/M, z1.s, z20.s\n"
+ "smin z0.s, p0/M, z0.s, z20.s\n"
"trn1 z18.h, z9.h, z8.h\n"
- "smin z7.s, p0/M, z7.s, z19.s\n"
- "smin z6.s, p0/M, z6.s, z19.s\n"
"trn1 z21.h, z7.h, z6.h\n"
- "smin z5.s, p0/M, z5.s, z19.s\n"
- "smin z4.s, p0/M, z4.s, z19.s\n"
"trn1 z17.h, z5.h, z4.h\n"
- "smin z3.s, p0/M, z3.s, z19.s\n"
- "smin z2.s, p0/M, z2.s, z19.s\n"
- "trn1 z20.h, z3.h, z2.h\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z19.h, z1.h, z0.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z20.b, z19.b, z16.b\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z16.h, z1.h, z0.h\n"
"trn1 z18.b, z22.b, z18.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "incb x27, ALL, MUL #4\n"
"trn1 z17.b, z21.b, z17.b\n"
- "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z20.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z16.b, z19.b, z16.b\n"
"st1b { z18.b }, p3, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
"st1b { z17.b }, p2, [%x[outptr], x25]\n"
@@ -391,10 +391,10 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"8:" // Single vector of channels: Loop
"ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
- "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
@@ -407,13 +407,13 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
"add x22, x22, #0x10\n"
"ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
@@ -427,29 +427,31 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
+ "subs x21, x21, #0x1\n"
"ld1b { z16.b }, p4/Z, [x20, x27]\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "ld1rw { z21.s }, p0/Z, [%x[left_shift]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
- ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ "mov z20.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
"ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
- ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x448282af // srshl z15.s, p0/M, z15.s, z21.s\n"
+ ".inst 0x448282ae // srshl z14.s, p0/M, z14.s, z21.s\n"
"ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x448282ad // srshl z13.s, p0/M, z13.s, z21.s\n"
+ ".inst 0x448282ac // srshl z12.s, p0/M, z12.s, z21.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
@@ -458,17 +460,15 @@ void sme_u8q_nhwc_avg_generic_depthfirst_impl(
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z17.s\n"
- "smax z14.s, p0/M, z14.s, z17.s\n"
- "smax z13.s, p0/M, z13.s, z17.s\n"
- "smax z12.s, p0/M, z12.s, z17.s\n"
- "smin z15.s, p0/M, z15.s, z16.s\n"
- "smin z14.s, p0/M, z14.s, z16.s\n"
+ "smax z15.s, p0/M, z15.s, z20.s\n"
+ "smax z14.s, p0/M, z14.s, z20.s\n"
+ "smax z13.s, p0/M, z13.s, z20.s\n"
+ "smax z12.s, p0/M, z12.s, z20.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z16.s\n"
- "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x27]\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
index c8e8e7d399..5632c96834 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,20 +56,20 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov z3.b, #0x0\n"
+ "mov z4.b, #0x0\n"
"mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x0\n"
"mov z2.b, #0x0\n"
- "mov z1.b, #0x0\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
@@ -83,34 +83,34 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
+ "movprfx z19, z1\n umax z19.b, p0/M, z19.b, z0.b\n"
+ "umax z23.b, p0/M, z23.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z18, z30\n umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
"umax z16.b, p0/M, z16.b, z25.b\n"
"umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
"umax z19.b, p0/M, z19.b, z23.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"umax z18.b, p0/M, z18.b, z22.b\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
"umax z17.b, p0/M, z17.b, z21.b\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z30.b }, p3/Z, [x23, x28]\n"
"umax z5.b, p0/M, z5.b, z19.b\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "ld1b { z18.b }, p3/Z, [x23, x28]\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "umax z1.b, p0/M, z1.b, z16.b\n"
"ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "umax z4.b, p0/M, z4.b, z18.b\n"
"ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "umax z3.b, p0/M, z3.b, z17.b\n"
"ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
"ld1b { z17.b }, p2/Z, [x23, x27]\n"
"ld1b { z27.b }, p2/Z, [x22, x27]\n"
"ld1b { z21.b }, p2/Z, [x21, x27]\n"
@@ -121,9 +121,9 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
- "umax z23.b, p0/M, z23.b, z30.b\n"
- "umax z18.b, p0/M, z18.b, z29.b\n"
+ "movprfx z19, z1\n umax z19.b, p0/M, z19.b, z0.b\n"
+ "umax z23.b, p0/M, z23.b, z31.b\n"
+ "movprfx z18, z30\n umax z18.b, p0/M, z18.b, z29.b\n"
"umax z22.b, p0/M, z22.b, z28.b\n"
"umax z17.b, p0/M, z17.b, z27.b\n"
"umax z21.b, p0/M, z21.b, z26.b\n"
@@ -134,172 +134,172 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"umax z17.b, p0/M, z17.b, z21.b\n"
"umax z16.b, p0/M, z16.b, z20.b\n"
"umax z5.b, p0/M, z5.b, z19.b\n"
- "umax z3.b, p0/M, z3.b, z18.b\n"
- "umax z2.b, p0/M, z2.b, z17.b\n"
- "umax z1.b, p0/M, z1.b, z16.b\n"
+ "umax z4.b, p0/M, z4.b, z18.b\n"
+ "umax z3.b, p0/M, z3.b, z17.b\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
- "ld1b { z16.b }, p3/Z, [x20, x28]\n"
- "umax z3.b, p0/M, z3.b, z16.b\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z19.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x27]\n"
"ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "umax z1.b, p0/M, z1.b, z16.b\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z4.b, p0/M, z4.b, z18.b\n"
+ "umax z3.b, p0/M, z3.b, z17.b\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a8b7 // ushllb z23.h, z5.b, #0x0\n"
- ".inst 0x4508acb9 // ushllt z25.h, z5.b, #0x0\n"
- ".inst 0x4508a876 // ushllb z22.h, z3.b, #0x0\n"
- ".inst 0x4508ac72 // ushllt z18.h, z3.b, #0x0\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4508a855 // ushllb z21.h, z2.b, #0x0\n"
- ".inst 0x4508ac51 // ushllt z17.h, z2.b, #0x0\n"
+ ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z6.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a895 // ushllb z21.h, z4.b, #0x0\n"
+ ".inst 0x4508ac92 // ushllt z18.h, z4.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x4508a834 // ushllb z20.h, z1.b, #0x0\n"
- ".inst 0x4508ac38 // ushllt z24.h, z1.b, #0x0\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z19.s }, p0/Z, [x20]\n"
- "neg z4.s, p0/M, z4.s\n"
- ".inst 0x45974081 // saddwb z1.s, z4.s, z23.h\n"
+ ".inst 0x4508a874 // ushllb z20.h, z3.b, #0x0\n"
+ ".inst 0x4508ac71 // ushllt z17.h, z3.b, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [x21]\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ ".inst 0x4508a858 // ushllb z24.h, z2.b, #0x0\n"
+ ".inst 0x4508ac57 // ushllt z23.h, z2.b, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x45974497 // saddwt z23.s, z4.s, z23.h\n"
- ".inst 0x45994080 // saddwb z0.s, z4.s, z25.h\n"
- ".inst 0x4599449f // saddwt z31.s, z4.s, z25.h\n"
- ".inst 0x4596409e // saddwb z30.s, z4.s, z22.h\n"
- ".inst 0x45964496 // saddwt z22.s, z4.s, z22.h\n"
- ".inst 0x4592409d // saddwb z29.s, z4.s, z18.h\n"
- ".inst 0x45924492 // saddwt z18.s, z4.s, z18.h\n"
- ".inst 0x4595409c // saddwb z28.s, z4.s, z21.h\n"
- ".inst 0x45954495 // saddwt z21.s, z4.s, z21.h\n"
- ".inst 0x4591409b // saddwb z27.s, z4.s, z17.h\n"
- ".inst 0x45914491 // saddwt z17.s, z4.s, z17.h\n"
- ".inst 0x4594409a // saddwb z26.s, z4.s, z20.h\n"
- ".inst 0x45944494 // saddwt z20.s, z4.s, z20.h\n"
- ".inst 0x45984099 // saddwb z25.s, z4.s, z24.h\n"
- ".inst 0x45984498 // saddwt z24.s, z4.s, z24.h\n"
- ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
- ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
- ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ "neg z6.s, p0/M, z6.s\n"
+ "ld1rw { z3.s }, p0/Z, [x21]\n"
+ "mov z2.s, #0x0\n"
+ "ld1rw { z1.s }, p0/Z, [x20]\n"
+ "mov z0.s, #0xff\n"
+ ".inst 0x459340df // saddwb z31.s, z6.s, z19.h\n"
+ ".inst 0x459344d3 // saddwt z19.s, z6.s, z19.h\n"
+ ".inst 0x459040de // saddwb z30.s, z6.s, z16.h\n"
+ ".inst 0x459044d0 // saddwt z16.s, z6.s, z16.h\n"
+ ".inst 0x459540dd // saddwb z29.s, z6.s, z21.h\n"
+ ".inst 0x459544d6 // saddwt z22.s, z6.s, z21.h\n"
+ ".inst 0x459240dc // saddwb z28.s, z6.s, z18.h\n"
+ ".inst 0x459244d2 // saddwt z18.s, z6.s, z18.h\n"
+ ".inst 0x459440db // saddwb z27.s, z6.s, z20.h\n"
+ ".inst 0x459444d5 // saddwt z21.s, z6.s, z20.h\n"
+ ".inst 0x459140d4 // saddwb z20.s, z6.s, z17.h\n"
+ ".inst 0x459144d1 // saddwt z17.s, z6.s, z17.h\n"
+ ".inst 0x459840da // saddwb z26.s, z6.s, z24.h\n"
+ ".inst 0x459844d9 // saddwt z25.s, z6.s, z24.h\n"
+ ".inst 0x459740d8 // saddwb z24.s, z6.s, z23.h\n"
+ ".inst 0x459744d7 // saddwt z23.s, z6.s, z23.h\n"
+ ".inst 0x448280bf // srshl z31.s, p0/M, z31.s, z5.s\n"
+ ".inst 0x448280b3 // srshl z19.s, p0/M, z19.s, z5.s\n"
+ ".inst 0x448280be // srshl z30.s, p0/M, z30.s, z5.s\n"
+ ".inst 0x448280b0 // srshl z16.s, p0/M, z16.s, z5.s\n"
+ ".inst 0x448280bd // srshl z29.s, p0/M, z29.s, z5.s\n"
+ ".inst 0x448280b6 // srshl z22.s, p0/M, z22.s, z5.s\n"
+ ".inst 0x448280bc // srshl z28.s, p0/M, z28.s, z5.s\n"
+ ".inst 0x448280b2 // srshl z18.s, p0/M, z18.s, z5.s\n"
+ ".inst 0x448280bb // srshl z27.s, p0/M, z27.s, z5.s\n"
+ ".inst 0x448280b5 // srshl z21.s, p0/M, z21.s, z5.s\n"
+ ".inst 0x448280b4 // srshl z20.s, p0/M, z20.s, z5.s\n"
+ ".inst 0x448280b1 // srshl z17.s, p0/M, z17.s, z5.s\n"
+ ".inst 0x448280ba // srshl z26.s, p0/M, z26.s, z5.s\n"
+ ".inst 0x448280b9 // srshl z25.s, p0/M, z25.s, z5.s\n"
+ ".inst 0x448280b8 // srshl z24.s, p0/M, z24.s, z5.s\n"
+ ".inst 0x448280b7 // srshl z23.s, p0/M, z23.s, z5.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
+ ".inst 0x44828073 // srshl z19.s, p0/M, z19.s, z3.s\n"
".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
- ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
+ ".inst 0x44828070 // srshl z16.s, p0/M, z16.s, z3.s\n"
".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
- ".inst 0x44828072 // srshl z18.s, p0/M, z18.s, z3.s\n"
+ ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
- ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
+ ".inst 0x44828072 // srshl z18.s, p0/M, z18.s, z3.s\n"
".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
+ ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
+ ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
".inst 0x44828071 // srshl z17.s, p0/M, z17.s, z3.s\n"
".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
- ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
- ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
- ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
- ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
- ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
- ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
- ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
- ".inst 0x04a277bd // sqrdmulh z29.s, z29.s, z2.s\n"
- ".inst 0x04a27652 // sqrdmulh z18.s, z18.s, z2.s\n"
- ".inst 0x04a2779c // sqrdmulh z28.s, z28.s, z2.s\n"
- ".inst 0x04a276b5 // sqrdmulh z21.s, z21.s, z2.s\n"
- ".inst 0x04a2777b // sqrdmulh z27.s, z27.s, z2.s\n"
- ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
- ".inst 0x04a2775a // sqrdmulh z26.s, z26.s, z2.s\n"
- ".inst 0x04a27694 // sqrdmulh z20.s, z20.s, z2.s\n"
- ".inst 0x04a27739 // sqrdmulh z25.s, z25.s, z2.s\n"
- ".inst 0x04a27718 // sqrdmulh z24.s, z24.s, z2.s\n"
- ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
- ".inst 0x44828277 // srshl z23.s, p0/M, z23.s, z19.s\n"
- ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
- ".inst 0x4482827f // srshl z31.s, p0/M, z31.s, z19.s\n"
- ".inst 0x4482827e // srshl z30.s, p0/M, z30.s, z19.s\n"
- ".inst 0x44828276 // srshl z22.s, p0/M, z22.s, z19.s\n"
- ".inst 0x4482827d // srshl z29.s, p0/M, z29.s, z19.s\n"
- ".inst 0x44828272 // srshl z18.s, p0/M, z18.s, z19.s\n"
- ".inst 0x4482827c // srshl z28.s, p0/M, z28.s, z19.s\n"
- ".inst 0x44828275 // srshl z21.s, p0/M, z21.s, z19.s\n"
- ".inst 0x4482827b // srshl z27.s, p0/M, z27.s, z19.s\n"
- ".inst 0x44828271 // srshl z17.s, p0/M, z17.s, z19.s\n"
- ".inst 0x4482827a // srshl z26.s, p0/M, z26.s, z19.s\n"
- ".inst 0x44828274 // srshl z20.s, p0/M, z20.s, z19.s\n"
- ".inst 0x44828279 // srshl z25.s, p0/M, z25.s, z19.s\n"
- ".inst 0x44828278 // srshl z24.s, p0/M, z24.s, z19.s\n"
- "add z1.s, z1.s, z16.s\n"
- "add z23.s, z23.s, z16.s\n"
- "add z0.s, z0.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
- "add z30.s, z30.s, z16.s\n"
- "add z22.s, z22.s, z16.s\n"
- "add z29.s, z29.s, z16.s\n"
- "add z18.s, z18.s, z16.s\n"
- "add z28.s, z28.s, z16.s\n"
- "add z21.s, z21.s, z16.s\n"
- "add z27.s, z27.s, z16.s\n"
- "add z17.s, z17.s, z16.s\n"
- "add z26.s, z26.s, z16.s\n"
- "add z20.s, z20.s, z16.s\n"
- "add z25.s, z25.s, z16.s\n"
- "add z24.s, z24.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "mov z19.s, #0xff\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smax z30.s, p0/M, z30.s, z16.s\n"
- "smax z22.s, p0/M, z22.s, z16.s\n"
- "smax z29.s, p0/M, z29.s, z16.s\n"
- "smax z18.s, p0/M, z18.s, z16.s\n"
- "smax z28.s, p0/M, z28.s, z16.s\n"
- "smax z21.s, p0/M, z21.s, z16.s\n"
- "smax z27.s, p0/M, z27.s, z16.s\n"
- "smax z17.s, p0/M, z17.s, z16.s\n"
- "smax z26.s, p0/M, z26.s, z16.s\n"
- "smax z20.s, p0/M, z20.s, z16.s\n"
- "smax z25.s, p0/M, z25.s, z16.s\n"
- "smax z24.s, p0/M, z24.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z19.s\n"
- "smin z23.s, p0/M, z23.s, z19.s\n"
- "smin z0.s, p0/M, z0.s, z19.s\n"
- "trn1 z23.h, z1.h, z23.h\n"
- "smin z31.s, p0/M, z31.s, z19.s\n"
- "smin z30.s, p0/M, z30.s, z19.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "smin z22.s, p0/M, z22.s, z19.s\n"
- "smin z29.s, p0/M, z29.s, z19.s\n"
- "trn1 z22.h, z30.h, z22.h\n"
- "smin z18.s, p0/M, z18.s, z19.s\n"
- "smin z28.s, p0/M, z28.s, z19.s\n"
- "trn1 z18.h, z29.h, z18.h\n"
- "smin z21.s, p0/M, z21.s, z19.s\n"
- "smin z27.s, p0/M, z27.s, z19.s\n"
- "trn1 z21.h, z28.h, z21.h\n"
- "smin z17.s, p0/M, z17.s, z19.s\n"
- "smin z26.s, p0/M, z26.s, z19.s\n"
- "trn1 z17.h, z27.h, z17.h\n"
- "smin z20.s, p0/M, z20.s, z19.s\n"
- "smin z25.s, p0/M, z25.s, z19.s\n"
- "trn1 z20.h, z26.h, z20.h\n"
- "smin z24.s, p0/M, z24.s, z19.s\n"
- "trn1 z19.h, z25.h, z24.h\n"
- "trn1 z16.b, z23.b, z16.b\n"
+ ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
+ "add z31.s, z31.s, z1.s\n"
+ "add z19.s, z19.s, z1.s\n"
+ "add z30.s, z30.s, z1.s\n"
+ "add z16.s, z16.s, z1.s\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z22.s, z22.s, z1.s\n"
+ "add z28.s, z28.s, z1.s\n"
+ "add z18.s, z18.s, z1.s\n"
+ "add z27.s, z27.s, z1.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z26.s, z26.s, z1.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z24.s, z24.s, z1.s\n"
+ "add z23.s, z23.s, z1.s\n"
+ "smax z31.s, p0/M, z31.s, z2.s\n"
+ "smax z19.s, p0/M, z19.s, z2.s\n"
+ "smax z30.s, p0/M, z30.s, z2.s\n"
+ "smax z16.s, p0/M, z16.s, z2.s\n"
+ "smax z29.s, p0/M, z29.s, z2.s\n"
+ "smax z22.s, p0/M, z22.s, z2.s\n"
+ "smax z28.s, p0/M, z28.s, z2.s\n"
+ "smax z18.s, p0/M, z18.s, z2.s\n"
+ "smax z27.s, p0/M, z27.s, z2.s\n"
+ "smax z21.s, p0/M, z21.s, z2.s\n"
+ "smax z20.s, p0/M, z20.s, z2.s\n"
+ "smax z17.s, p0/M, z17.s, z2.s\n"
+ "smax z26.s, p0/M, z26.s, z2.s\n"
+ "smax z25.s, p0/M, z25.s, z2.s\n"
+ "smax z24.s, p0/M, z24.s, z2.s\n"
+ "smax z23.s, p0/M, z23.s, z2.s\n"
+ "smin z31.s, p0/M, z31.s, z0.s\n"
+ "smin z19.s, p0/M, z19.s, z0.s\n"
+ "smin z30.s, p0/M, z30.s, z0.s\n"
+ "smin z16.s, p0/M, z16.s, z0.s\n"
+ "smin z29.s, p0/M, z29.s, z0.s\n"
+ "smin z22.s, p0/M, z22.s, z0.s\n"
+ "smin z28.s, p0/M, z28.s, z0.s\n"
+ "smin z18.s, p0/M, z18.s, z0.s\n"
+ "smin z27.s, p0/M, z27.s, z0.s\n"
+ "smin z21.s, p0/M, z21.s, z0.s\n"
+ "trn1 z19.h, z31.h, z19.h\n"
+ "smin z20.s, p0/M, z20.s, z0.s\n"
+ "smin z17.s, p0/M, z17.s, z0.s\n"
+ "trn1 z16.h, z30.h, z16.h\n"
+ "smin z26.s, p0/M, z26.s, z0.s\n"
+ "smin z25.s, p0/M, z25.s, z0.s\n"
+ "trn1 z22.h, z29.h, z22.h\n"
+ "smin z24.s, p0/M, z24.s, z0.s\n"
+ "smin z23.s, p0/M, z23.s, z0.s\n"
+ "trn1 z18.h, z28.h, z18.h\n"
+ "trn1 z21.h, z27.h, z21.h\n"
+ "trn1 z17.h, z20.h, z17.h\n"
+ "trn1 z20.b, z19.b, z16.b\n"
+ "trn1 z19.h, z26.h, z25.h\n"
+ "trn1 z16.h, z24.h, z23.h\n"
"trn1 z18.b, z22.b, z18.b\n"
"trn1 z17.b, z21.b, z17.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z20.b }, p4, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
- "trn1 z16.b, z20.b, z19.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
"st1b { z18.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z17.b }, p2, [%x[outptr], x27]\n"
@@ -316,32 +316,32 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"mov z5.b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
- "ldp x20, x22, [x24, #0x0]\n"
+ "ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "ld1b { z0.b }, p4/Z, [x20, x9]\n"
"ldp x21, x20, [x24, #0x10]\n"
"add x24, x24, #0x20\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n umax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z31.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"subs x25, x25, #0x1\n"
- "umax z16.b, p0/M, z16.b, z17.b\n"
"ldp x21, x20, [x24, #0x10]\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z0.b }, p4/Z, [x23, x9]\n"
- "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ld1b { z0.b }, p4/Z, [x22, x9]\n"
"ld1b { z23.b }, p4/Z, [x21, x9]\n"
- "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x20, x9]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
- "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "movprfx z16, z1\n umax z16.b, p0/M, z16.b, z0.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z31.b\n"
"umax z16.b, p0/M, z16.b, z17.b\n"
"umax z5.b, p0/M, z5.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
@@ -349,56 +349,56 @@ void sme_u8q_nhwc_max_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z18.s }, p0/Z, [x20]\n"
+ "add x21, %x[quant_params], %[offsetof_qp_input_offset]\n"
".inst 0x4508a8b1 // ushllb z17.h, z5.b, #0x0\n"
- ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
- "neg z18.s, p0/M, z18.s\n"
- ".inst 0x45914257 // saddwb z23.s, z18.s, z17.h\n"
+ ".inst 0x4508acba // ushllt z26.h, z5.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z22.s }, p0/Z, [x20]\n"
- ".inst 0x45914655 // saddwt z21.s, z18.s, z17.h\n"
- ".inst 0x45904254 // saddwb z20.s, z18.s, z16.h\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z19.s }, p0/Z, [x20]\n"
- ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
- ".inst 0x448282d7 // srshl z23.s, p0/M, z23.s, z22.s\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x448282d5 // srshl z21.s, p0/M, z21.s, z22.s\n"
- ".inst 0x448282d4 // srshl z20.s, p0/M, z20.s, z22.s\n"
+ "ld1rw { z16.s }, p0/Z, [x21]\n"
+ "add x22, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "mov z25.s, #0x0\n"
+ "ld1rw { z24.s }, p0/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x448282d2 // srshl z18.s, p0/M, z18.s, z22.s\n"
- ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
- ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
- ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
- ".inst 0x04b37652 // sqrdmulh z18.s, z18.s, z19.s\n"
- ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
- ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
- ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
- ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
- "add z23.s, z23.s, z16.s\n"
- "add z21.s, z21.s, z16.s\n"
- "add z20.s, z20.s, z16.s\n"
- "add z18.s, z18.s, z16.s\n"
- "mov z17.s, #0x0\n"
- "mov z16.s, #0xff\n"
- "smax z23.s, p0/M, z23.s, z17.s\n"
- "smax z21.s, p0/M, z21.s, z17.s\n"
- "smax z20.s, p0/M, z20.s, z17.s\n"
- "smax z18.s, p0/M, z18.s, z17.s\n"
- "smin z23.s, p0/M, z23.s, z16.s\n"
- "smin z21.s, p0/M, z21.s, z16.s\n"
- "smin z20.s, p0/M, z20.s, z16.s\n"
- "trn1 z17.h, z23.h, z21.h\n"
- "smin z18.s, p0/M, z18.s, z16.s\n"
- "trn1 z16.h, z20.h, z18.h\n"
+ "mov z23.s, #0xff\n"
+ "ld1rw { z22.s }, p0/Z, [x22]\n"
+ "neg z16.s, p0/M, z16.s\n"
+ "ld1rw { z21.s }, p0/Z, [x21]\n"
+ "ld1rw { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x45914213 // saddwb z19.s, z16.s, z17.h\n"
+ ".inst 0x45914611 // saddwt z17.s, z16.s, z17.h\n"
+ ".inst 0x459a4212 // saddwb z18.s, z16.s, z26.h\n"
+ ".inst 0x459a4610 // saddwt z16.s, z16.s, z26.h\n"
+ ".inst 0x44828313 // srshl z19.s, p0/M, z19.s, z24.s\n"
+ ".inst 0x44828311 // srshl z17.s, p0/M, z17.s, z24.s\n"
+ ".inst 0x44828312 // srshl z18.s, p0/M, z18.s, z24.s\n"
+ ".inst 0x44828310 // srshl z16.s, p0/M, z16.s, z24.s\n"
+ ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n"
+ ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n"
+ ".inst 0x04b67652 // sqrdmulh z18.s, z18.s, z22.s\n"
+ ".inst 0x04b67610 // sqrdmulh z16.s, z16.s, z22.s\n"
+ ".inst 0x448282b3 // srshl z19.s, p0/M, z19.s, z21.s\n"
+ ".inst 0x448282b1 // srshl z17.s, p0/M, z17.s, z21.s\n"
+ ".inst 0x448282b2 // srshl z18.s, p0/M, z18.s, z21.s\n"
+ ".inst 0x448282b0 // srshl z16.s, p0/M, z16.s, z21.s\n"
+ "add z19.s, z19.s, z20.s\n"
+ "add z17.s, z17.s, z20.s\n"
+ "add z18.s, z18.s, z20.s\n"
+ "add z16.s, z16.s, z20.s\n"
+ "smax z19.s, p0/M, z19.s, z25.s\n"
+ "smax z17.s, p0/M, z17.s, z25.s\n"
+ "smax z18.s, p0/M, z18.s, z25.s\n"
+ "smax z16.s, p0/M, z16.s, z25.s\n"
+ "smin z19.s, p0/M, z19.s, z23.s\n"
+ "smin z17.s, p0/M, z17.s, z23.s\n"
+ "smin z18.s, p0/M, z18.s, z23.s\n"
+ "smin z16.s, p0/M, z16.s, z23.s\n"
+ "trn1 z17.h, z19.h, z17.h\n"
+ "trn1 z16.h, z18.h, z16.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p4, [%x[outptr], x9]\n"
"incb x9\n"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 1ba78f3fba..02b165da73 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,13 +87,13 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"mov x3, #0x0\n"
"mov x20, #0x4\n"
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x5, x6, [x21, #0x0]\n"
- "whilelt p2.h, XZR, x20\n"
+ "add x5, %x[args], %[offsetof_rescale]\n"
+ "mov x6, #0x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "whilelt p1.h, XZR, x20\n"
"whilelt p0.h, x3, x2\n"
- "ldp x7, x8, [x21, #0x10]\n"
- "ldp x17, x16, [x4, #0x0]\n"
- "add x15, %x[args], %[offsetof_rescale]\n"
- "mov x14, #0x0\n"
+ "ldp x15, x14, [x4, #0x0]\n"
"ldp x13, x12, [x4, #0x10]\n"
"ldp x11, x10, [x4, #0x20]\n"
"ldp x9, x28, [x4, #0x30]\n"
@@ -101,103 +101,103 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x25, x24, [x4, #0x50]\n"
"ldp x23, x22, [x4, #0x60]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1h { z7.h }, p0/Z, [x10, x3, LSL #1]\n"
- "ld1h { z6.h }, p0/Z, [x9, x3, LSL #1]\n"
- "ld1h { z5.h }, p0/Z, [x26, x3, LSL #1]\n"
- "ld1h { z4.h }, p0/Z, [x25, x3, LSL #1]\n"
- "ld1h { z3.h }, p0/Z, [x16, x3, LSL #1]\n"
- "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
- "ld1h { z1.h }, p0/Z, [x11, x3, LSL #1]\n"
- "ld1h { z31.h }, p0/Z, [x27, x3, LSL #1]\n"
- "ld1h { z30.h }, p0/Z, [x28, x3, LSL #1]\n"
- "ld1h { z29.h }, p0/Z, [x24, x3, LSL #1]\n"
- "ld1h { z28.h }, p0/Z, [x22, x3, LSL #1]\n"
- "ld1h { z27.h }, p0/Z, [x21, x3, LSL #1]\n"
- "ld1h { z26.h }, p0/Z, [x17, x3, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "ld1h { z8.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z7.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z6.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z5.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x14, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
+ "ld1rqh { z0.h }, p1/Z, [x5]\n"
"whilelt p1.h, x3, x2\n"
- "ld1rqh { z0.h }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.h, z7.h, z6.h\n"
- "fadd z16.h, z5.h, z4.h\n"
- "ld1h { z7.h }, p1/Z, [x10, x3, LSL #1]\n"
- "ld1h { z6.h }, p1/Z, [x9, x3, LSL #1]\n"
- "fadd z19.h, z17.h, z16.h\n"
- "fadd z18.h, z3.h, z2.h\n"
- "ld1h { z5.h }, p1/Z, [x26, x3, LSL #1]\n"
- "ld1h { z4.h }, p1/Z, [x25, x3, LSL #1]\n"
- "fadd z17.h, z1.h, z31.h\n"
- "fadd z22.h, z30.h, z29.h\n"
- "ld1h { z3.h }, p1/Z, [x16, x3, LSL #1]\n"
- "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
- "fadd z16.h, z28.h, z27.h\n"
- "fadd z21.h, z18.h, z19.h\n"
- "ld1h { z1.h }, p1/Z, [x11, x3, LSL #1]\n"
- "ld1h { z31.h }, p1/Z, [x27, x3, LSL #1]\n"
- "fadd z20.h, z16.h, z19.h\n"
- "fadd z19.h, z26.h, z17.h\n"
- "ld1h { z30.h }, p1/Z, [x28, x3, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x24, x3, LSL #1]\n"
- "fadd z18.h, z25.h, z22.h\n"
- "fadd z17.h, z24.h, z17.h\n"
- "ld1h { z28.h }, p1/Z, [x22, x3, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x21, x3, LSL #1]\n"
- "fadd z16.h, z23.h, z22.h\n"
- "ld1h { z26.h }, p1/Z, [x17, x3, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
- "fadd z19.h, z21.h, z19.h\n"
- "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "fadd z19.h, z8.h, z7.h\n"
+ "fadd z16.h, z6.h, z5.h\n"
+ "ld1h { z8.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z7.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z23.h, z4.h, z3.h\n"
+ "fadd z18.h, z2.h, z1.h\n"
+ "ld1h { z6.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z5.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z17.h, z31.h, z30.h\n"
+ "fadd z22.h, z29.h, z28.h\n"
+ "ld1h { z4.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "ld1h { z3.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z19.h, z16.h\n"
+ "ld1h { z2.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z1.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "whilelt p0.h, x6, x2\n"
+ "fadd z19.h, z27.h, z18.h\n"
+ "fadd z21.h, z25.h, z18.h\n"
+ "ld1h { z31.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fadd z18.h, z26.h, z17.h\n"
+ "fadd z20.h, z24.h, z17.h\n"
+ "ld1h { z29.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z28.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "fadd z17.h, z23.h, z16.h\n"
+ "fadd z16.h, z22.h, z16.h\n"
+ "ld1h { z27.h }, p1/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x3, LSL #1]\n"
"incw x3\n"
- "fadd z18.h, z21.h, z18.h\n"
- "fadd z17.h, z17.h, z20.h\n"
- "fadd z16.h, z16.h, z20.h\n"
- "whilelt p0.h, x14, x2\n"
+ "fadd z19.h, z17.h, z19.h\n"
+ "fadd z18.h, z17.h, z18.h\n"
+ "fadd z17.h, z21.h, z16.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
"whilelt p1.h, x3, x2\n"
"fmul z19.h, z19.h, z0.h[0]\n"
"fmul z18.h, z18.h, z0.h[1]\n"
- "st1h { z19.h }, p0, [x5, x14, LSL #1]\n"
"fmul z17.h, z17.h, z0.h[2]\n"
"fmul z16.h, z16.h, z0.h[3]\n"
- "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
- "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
- "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
- "incw x14\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ "incw x6\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.h, z7.h, z6.h\n"
- "fadd z16.h, z5.h, z4.h\n"
- "whilelt p0.h, x14, x2\n"
- "fadd z20.h, z17.h, z16.h\n"
- "fadd z18.h, z3.h, z2.h\n"
- "fadd z17.h, z1.h, z31.h\n"
- "fadd z19.h, z30.h, z29.h\n"
- "fadd z16.h, z28.h, z27.h\n"
- "fadd z21.h, z18.h, z20.h\n"
- "fadd z20.h, z16.h, z20.h\n"
- "fadd z16.h, z26.h, z17.h\n"
- "fadd z18.h, z25.h, z19.h\n"
- "fadd z17.h, z24.h, z17.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z16.h, z21.h, z16.h\n"
- "fmul z16.h, z16.h, z0.h[0]\n"
- "st1h { z16.h }, p0, [x5, x14, LSL #1]\n"
- "fadd z18.h, z21.h, z18.h\n"
- "fadd z17.h, z17.h, z20.h\n"
+ "fadd z19.h, z8.h, z7.h\n"
+ "fadd z16.h, z6.h, z5.h\n"
+ "whilelt p0.h, x6, x2\n"
+ "fadd z23.h, z4.h, z3.h\n"
+ "fadd z18.h, z2.h, z1.h\n"
+ "fadd z17.h, z31.h, z30.h\n"
+ "fadd z22.h, z29.h, z28.h\n"
+ "fadd z16.h, z19.h, z16.h\n"
+ "fadd z19.h, z27.h, z18.h\n"
+ "fadd z21.h, z25.h, z18.h\n"
+ "fadd z18.h, z26.h, z17.h\n"
+ "fadd z20.h, z24.h, z17.h\n"
+ "fadd z17.h, z23.h, z16.h\n"
+ "fadd z16.h, z22.h, z16.h\n"
+ "fadd z19.h, z17.h, z19.h\n"
+ "fadd z18.h, z17.h, z18.h\n"
+ "fadd z17.h, z21.h, z16.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "fmul z19.h, z19.h, z0.h[0]\n"
"fmul z18.h, z18.h, z0.h[1]\n"
"fmul z17.h, z17.h, z0.h[2]\n"
- "fadd z16.h, z19.h, z20.h\n"
"fmul z16.h, z16.h, z0.h[3]\n"
- "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
- "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
- "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 2bef44ea5c..942240d816 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,12 +46,12 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"cnth x28\n"
"cnth x27, ALL, MUL #2\n"
"cnth x26, ALL, MUL #3\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
"whilelt p3.h, x9, %x[n_channels]\n"
- "ld1rh { z7.h }, p0/Z, [%x[rescale_ptr]]\n"
"whilelt p2.h, x28, %x[n_channels]\n"
"whilelt p1.h, x27, %x[n_channels]\n"
"whilelt p0.h, x26, %x[n_channels]\n"
+ "ld1rh { z7.h }, p4/Z, [%x[rescale_ptr]]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
@@ -93,17 +93,17 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
"fadd z18.h, z22.h, z18.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z17.h, z21.h, z17.h\n"
"ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
"ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
- "fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
"ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
"ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
@@ -142,30 +142,30 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.h, z6.h, z16.h\n"
- "ld1h { z17.h }, p2/Z, [x20, x28, LSL #1]\n"
- "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
- "fadd z5.h, z5.h, z17.h\n"
- "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z19.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
"ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "fadd z4.h, z4.h, z17.h\n"
"fadd z3.h, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z6.h, z6.h, z7.h\n"
"fmul z5.h, z5.h, z7.h\n"
- "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
"fmul z4.h, z4.h, z7.h\n"
"fmul z3.h, z3.h, z7.h\n"
- "st1h { z5.h }, p2, [%x[outptr], x28, LSL #1]\n"
- "st1h { z4.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
"inch x9, ALL, MUL #4\n"
+ "st1h { z5.h }, p2, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
+ "st1h { z4.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x27, ALL, MUL #4\n"
"st1h { z3.h }, p0, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
"whilelt p0.h, x26, %x[n_channels]\n"
- "inch x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
"whilelt p3.h, x9, %x[n_channels]\n"
@@ -189,14 +189,14 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"fadd z16.h, z0.h, z31.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fadd z16.h, z17.h, z16.h\n"
"subs x25, x25, #0x1\n"
- "fadd z6.h, z6.h, z16.h\n"
"add x24, x24, #0x20\n"
+ "fadd z16.h, z17.h, z16.h\n"
"ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
"ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
"ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z6.h, z6.h, z16.h\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
"fadd z17.h, z2.h, z1.h\n"
@@ -208,8 +208,8 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z6.h, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
@@ -221,7 +221,7 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 31bbfd085e..eef19e9993 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,22 +66,22 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p0.h, x14, x15\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "mov x11, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
+ "whilelt p0.h, x14, x15\n"
"ldp x28, x27, [x20, #0x0]\n"
"ldp x26, x25, [x20, #0x10]\n"
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
"ld1h { z31.h }, p0/Z, [x27, x14, LSL #1]\n"
- "ld1h { z30.h }, p0/Z, [x24, x14, LSL #1]\n"
- "ld1h { z29.h }, p0/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x24, x14, LSL #1]\n"
"ld1h { z28.h }, p0/Z, [x25, x14, LSL #1]\n"
- "ld1h { z27.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x14, LSL #1]\n"
"ld1h { z26.h }, p0/Z, [x26, x14, LSL #1]\n"
"ld1h { z25.h }, p0/Z, [x23, x14, LSL #1]\n"
"ld1h { z24.h }, p0/Z, [x22, x14, LSL #1]\n"
@@ -90,50 +90,50 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p1.h, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
+ "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z29.h\n"
+ "movprfx z21, z29\n fmax z21.h, p2/M, z21.h, z27.h\n"
"ld1h { z31.h }, p1/Z, [x27, x14, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x24, x14, LSL #1]\n"
- "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
- "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
- "ld1h { z29.h }, p1/Z, [x21, x14, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x28, x14, LSL #1]\n"
- "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
- "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z29.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z30.h\n"
+ "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
+ "ld1h { z27.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x28, x14, LSL #1]\n"
+ "movprfx z16, z28\n fmax z16.h, p2/M, z16.h, z24.h\n"
+ "movprfx z20, z25\n fmax z20.h, p2/M, z20.h, z23.h\n"
"ld1h { z28.h }, p1/Z, [x25, x14, LSL #1]\n"
"ld1h { z26.h }, p1/Z, [x26, x14, LSL #1]\n"
"ld1h { z25.h }, p1/Z, [x23, x14, LSL #1]\n"
"ld1h { z24.h }, p1/Z, [x22, x14, LSL #1]\n"
- "whilelt p0.h, x11, x15\n"
- "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "whilelt p0.h, x13, x15\n"
"ld1h { z23.h }, p1/Z, [x20, x14, LSL #1]\n"
"incw x14\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
"whilelt p1.h, x14, x15\n"
- "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
- "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
- "fmax z17.h, p2/M, z17.h, z21.h\n"
- "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
- "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
- "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
- "incw x11\n"
+ "st1h { z19.h }, p0, [x12, x13, LSL #1]\n"
+ "st1h { z18.h }, p0, [x11, x13, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x13, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x13, LSL #1]\n"
+ "incw x13\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
- "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
- "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
- "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
- "whilelt p0.h, x11, x15\n"
- "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
- "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
- "fmax z17.h, p2/M, z17.h, z21.h\n"
- "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
- "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
- "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
+ "movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z29.h\n"
+ "movprfx z21, z29\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z30.h\n"
+ "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
+ "movprfx z16, z28\n fmax z16.h, p2/M, z16.h, z24.h\n"
+ "movprfx z20, z25\n fmax z20.h, p2/M, z20.h, z23.h\n"
+ "whilelt p0.h, x13, x15\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "st1h { z19.h }, p0, [x12, x13, LSL #1]\n"
+ "st1h { z18.h }, p0, [x11, x13, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x13, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x13, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
index 1a01412836..31c4f48b96 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,176 +44,176 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
"cnth x28\n"
"cnth x27, ALL, MUL #2\n"
"cnth x26, ALL, MUL #3\n"
- "whilelt p4.h, x9, %x[n_channels]\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
- "whilelt p2.h, x27, %x[n_channels]\n"
- "whilelt p1.h, x26, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.h, #0xfc00\n"
- "mov z7.h, #0xfc00\n"
- "mov x24, %x[inptrs]\n"
"mov z6.h, #0xfc00\n"
"mov z5.h, #0xfc00\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z4.h, #0xfc00\n"
+ "mov z3.h, #0xfc00\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z31.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z28.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
- "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "movprfx z19, z2\n fmax z19.h, p4/M, z19.h, z1.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
- "fmax z22.h, p0/M, z22.h, z30.h\n"
- "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
- "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
- "fmax z21.h, p0/M, z21.h, z27.h\n"
- "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
- "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
- "fmax z20.h, p0/M, z20.h, z24.h\n"
- "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "fmax z18.h, p0/M, z18.h, z22.h\n"
- "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
- "fmax z17.h, p0/M, z17.h, z21.h\n"
- "fmax z16.h, p0/M, z16.h, z20.h\n"
- "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
+ "fmax z22.h, p4/M, z22.h, z29.h\n"
+ "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
+ "fmax z21.h, p4/M, z21.h, z26.h\n"
+ "fmax z16.h, p4/M, z16.h, z25.h\n"
+ "fmax z20.h, p4/M, z20.h, z24.h\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "fmax z19.h, p4/M, z19.h, z23.h\n"
+ "fmax z18.h, p4/M, z18.h, z22.h\n"
+ "ld1h { z23.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fmax z17.h, p4/M, z17.h, z21.h\n"
"subs x25, x25, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z19.h\n"
- "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
- "fmax z7.h, p0/M, z7.h, z18.h\n"
- "fmax z6.h, p0/M, z6.h, z17.h\n"
- "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fmax z5.h, p0/M, z5.h, z16.h\n"
+ "ld1h { z31.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "fmax z16.h, p4/M, z16.h, z20.h\n"
"add x24, x24, #0x20\n"
- "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fmax z6.h, p4/M, z6.h, z19.h\n"
+ "fmax z5.h, p4/M, z5.h, z18.h\n"
+ "ld1h { z28.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "fmax z4.h, p4/M, z4.h, z17.h\n"
+ "ld1h { z21.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fmax z3.h, p4/M, z3.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
- "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
- "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
- "fmax z22.h, p0/M, z22.h, z30.h\n"
- "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
- "fmax z21.h, p0/M, z21.h, z27.h\n"
- "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
- "fmax z20.h, p0/M, z20.h, z24.h\n"
- "fmax z19.h, p0/M, z19.h, z23.h\n"
- "fmax z18.h, p0/M, z18.h, z22.h\n"
- "fmax z17.h, p0/M, z17.h, z21.h\n"
- "fmax z16.h, p0/M, z16.h, z20.h\n"
- "fmax z8.h, p0/M, z8.h, z19.h\n"
- "fmax z7.h, p0/M, z7.h, z18.h\n"
- "fmax z6.h, p0/M, z6.h, z17.h\n"
- "fmax z5.h, p0/M, z5.h, z16.h\n"
+ "movprfx z19, z2\n fmax z19.h, p4/M, z19.h, z1.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
+ "fmax z22.h, p4/M, z22.h, z29.h\n"
+ "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
+ "fmax z21.h, p4/M, z21.h, z26.h\n"
+ "fmax z16.h, p4/M, z16.h, z25.h\n"
+ "fmax z20.h, p4/M, z20.h, z24.h\n"
+ "fmax z19.h, p4/M, z19.h, z23.h\n"
+ "fmax z18.h, p4/M, z18.h, z22.h\n"
+ "fmax z17.h, p4/M, z17.h, z21.h\n"
+ "fmax z16.h, p4/M, z16.h, z20.h\n"
+ "fmax z6.h, p4/M, z6.h, z19.h\n"
+ "fmax z5.h, p4/M, z5.h, z18.h\n"
+ "fmax z4.h, p4/M, z4.h, z17.h\n"
+ "fmax z3.h, p4/M, z3.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z16.h\n"
- "ld1h { z17.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
- "fmax z7.h, p0/M, z7.h, z17.h\n"
- "fmax z6.h, p0/M, z6.h, z16.h\n"
- "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fmax z5.h, p0/M, z5.h, z16.h\n"
+ "ld1h { z19.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fmax z6.h, p4/M, z6.h, z19.h\n"
+ "fmax z5.h, p4/M, z5.h, z18.h\n"
+ "fmax z4.h, p4/M, z4.h, z17.h\n"
+ "fmax z3.h, p4/M, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
"inch x9, ALL, MUL #4\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
+ "st1h { z5.h }, p2, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
- "st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
+ "st1h { z4.h }, p1, [%x[outptr], x27, LSL #1]\n"
"inch x27, ALL, MUL #4\n"
- "st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
+ "st1h { z3.h }, p0, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "whilelt p1.h, x26, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.h, x9, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.h, #0xfc00\n"
+ "mov z6.h, #0xfc00\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
- "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
- "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "movprfx z16, z2\n fmax z16.h, p4/M, z16.h, z1.h\n"
+ "movprfx z17, z23\n fmax z17.h, p4/M, z17.h, z0.h\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fmax z16.h, p0/M, z16.h, z17.h\n"
"subs x25, x25, #0x1\n"
- "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
- "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
- "fmax z8.h, p0/M, z8.h, z16.h\n"
"add x24, x24, #0x20\n"
- "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
- "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "fmax z16.h, p4/M, z16.h, z17.h\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z23.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fmax z6.h, p4/M, z6.h, z16.h\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
- "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
- "fmax z16.h, p0/M, z16.h, z17.h\n"
- "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "movprfx z16, z2\n fmax z16.h, p4/M, z16.h, z1.h\n"
+ "movprfx z17, z23\n fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z17.h\n"
+ "fmax z6.h, p4/M, z6.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fmax z6.h, p4/M, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
"inch x9\n"
- "whilelt p4.h, x9, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index c5ea5adea0..059c0468df 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -87,13 +87,13 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"mov x3, #0x0\n"
"mov x20, #0x4\n"
"ldr x4, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x5, x6, [x21, #0x0]\n"
- "whilelt p2.s, XZR, x20\n"
+ "add x5, %x[args], %[offsetof_rescale]\n"
+ "mov x6, #0x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "whilelt p1.s, XZR, x20\n"
"whilelt p0.s, x3, x2\n"
- "ldp x7, x8, [x21, #0x10]\n"
- "ldp x17, x16, [x4, #0x0]\n"
- "add x15, %x[args], %[offsetof_rescale]\n"
- "mov x14, #0x0\n"
+ "ldp x15, x14, [x4, #0x0]\n"
"ldp x13, x12, [x4, #0x10]\n"
"ldp x11, x10, [x4, #0x20]\n"
"ldp x9, x28, [x4, #0x30]\n"
@@ -101,103 +101,103 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
"ldp x25, x24, [x4, #0x50]\n"
"ldp x23, x22, [x4, #0x60]\n"
"ldp x21, x20, [x4, #0x70]\n"
- "ld1w { z7.s }, p0/Z, [x10, x3, LSL #2]\n"
- "ld1w { z6.s }, p0/Z, [x9, x3, LSL #2]\n"
- "ld1w { z5.s }, p0/Z, [x26, x3, LSL #2]\n"
- "ld1w { z4.s }, p0/Z, [x25, x3, LSL #2]\n"
- "ld1w { z3.s }, p0/Z, [x16, x3, LSL #2]\n"
- "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
- "ld1w { z1.s }, p0/Z, [x11, x3, LSL #2]\n"
- "ld1w { z31.s }, p0/Z, [x27, x3, LSL #2]\n"
- "ld1w { z30.s }, p0/Z, [x28, x3, LSL #2]\n"
- "ld1w { z29.s }, p0/Z, [x24, x3, LSL #2]\n"
- "ld1w { z28.s }, p0/Z, [x22, x3, LSL #2]\n"
- "ld1w { z27.s }, p0/Z, [x21, x3, LSL #2]\n"
- "ld1w { z26.s }, p0/Z, [x17, x3, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+ "ld1w { z8.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z7.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z6.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x14, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x15, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
+ "ld1rqw { z0.s }, p1/Z, [x5]\n"
"whilelt p1.s, x3, x2\n"
- "ld1rqw { z0.s }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.s, z7.s, z6.s\n"
- "fadd z16.s, z5.s, z4.s\n"
- "ld1w { z7.s }, p1/Z, [x10, x3, LSL #2]\n"
- "ld1w { z6.s }, p1/Z, [x9, x3, LSL #2]\n"
- "fadd z19.s, z17.s, z16.s\n"
- "fadd z18.s, z3.s, z2.s\n"
- "ld1w { z5.s }, p1/Z, [x26, x3, LSL #2]\n"
- "ld1w { z4.s }, p1/Z, [x25, x3, LSL #2]\n"
- "fadd z17.s, z1.s, z31.s\n"
- "fadd z22.s, z30.s, z29.s\n"
- "ld1w { z3.s }, p1/Z, [x16, x3, LSL #2]\n"
- "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
- "fadd z16.s, z28.s, z27.s\n"
- "fadd z21.s, z18.s, z19.s\n"
- "ld1w { z1.s }, p1/Z, [x11, x3, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x27, x3, LSL #2]\n"
- "fadd z20.s, z16.s, z19.s\n"
- "fadd z19.s, z26.s, z17.s\n"
- "ld1w { z30.s }, p1/Z, [x28, x3, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x24, x3, LSL #2]\n"
- "fadd z18.s, z25.s, z22.s\n"
- "fadd z17.s, z24.s, z17.s\n"
- "ld1w { z28.s }, p1/Z, [x22, x3, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x21, x3, LSL #2]\n"
- "fadd z16.s, z23.s, z22.s\n"
- "ld1w { z26.s }, p1/Z, [x17, x3, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
- "fadd z19.s, z21.s, z19.s\n"
- "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "fadd z19.s, z8.s, z7.s\n"
+ "fadd z16.s, z6.s, z5.s\n"
+ "ld1w { z8.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z7.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "fadd z23.s, z4.s, z3.s\n"
+ "fadd z18.s, z2.s, z1.s\n"
+ "ld1w { z6.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "fadd z17.s, z31.s, z30.s\n"
+ "fadd z22.s, z29.s, z28.s\n"
+ "ld1w { z4.s }, p1/Z, [x14, x3, LSL #2]\n"
+ "ld1w { z3.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "fadd z16.s, z19.s, z16.s\n"
+ "ld1w { z2.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z1.s }, p1/Z, [x27, x3, LSL #2]\n"
+ "whilelt p0.s, x6, x2\n"
+ "fadd z19.s, z27.s, z18.s\n"
+ "fadd z21.s, z25.s, z18.s\n"
+ "ld1w { z31.s }, p1/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x24, x3, LSL #2]\n"
+ "fadd z18.s, z26.s, z17.s\n"
+ "fadd z20.s, z24.s, z17.s\n"
+ "ld1w { z29.s }, p1/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x21, x3, LSL #2]\n"
+ "fadd z17.s, z23.s, z16.s\n"
+ "fadd z16.s, z22.s, z16.s\n"
+ "ld1w { z27.s }, p1/Z, [x15, x3, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x3, LSL #2]\n"
"incw x3\n"
- "fadd z18.s, z21.s, z18.s\n"
- "fadd z17.s, z17.s, z20.s\n"
- "fadd z16.s, z16.s, z20.s\n"
- "whilelt p0.s, x14, x2\n"
+ "fadd z19.s, z17.s, z19.s\n"
+ "fadd z18.s, z17.s, z18.s\n"
+ "fadd z17.s, z21.s, z16.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
"whilelt p1.s, x3, x2\n"
"fmul z19.s, z19.s, z0.s[0]\n"
"fmul z18.s, z18.s, z0.s[1]\n"
- "st1w { z19.s }, p0, [x5, x14, LSL #2]\n"
"fmul z17.s, z17.s, z0.s[2]\n"
"fmul z16.s, z16.s, z0.s[3]\n"
- "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
- "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
- "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
- "incw x14\n"
+ "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+ "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+ "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+ "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+ "incw x6\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.s, z7.s, z6.s\n"
- "fadd z16.s, z5.s, z4.s\n"
- "whilelt p0.s, x14, x2\n"
- "fadd z20.s, z17.s, z16.s\n"
- "fadd z18.s, z3.s, z2.s\n"
- "fadd z17.s, z1.s, z31.s\n"
- "fadd z19.s, z30.s, z29.s\n"
- "fadd z16.s, z28.s, z27.s\n"
- "fadd z21.s, z18.s, z20.s\n"
- "fadd z20.s, z16.s, z20.s\n"
- "fadd z16.s, z26.s, z17.s\n"
- "fadd z18.s, z25.s, z19.s\n"
- "fadd z17.s, z24.s, z17.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z16.s, z21.s, z16.s\n"
- "fmul z16.s, z16.s, z0.s[0]\n"
- "st1w { z16.s }, p0, [x5, x14, LSL #2]\n"
- "fadd z18.s, z21.s, z18.s\n"
- "fadd z17.s, z17.s, z20.s\n"
+ "fadd z19.s, z8.s, z7.s\n"
+ "fadd z16.s, z6.s, z5.s\n"
+ "whilelt p0.s, x6, x2\n"
+ "fadd z23.s, z4.s, z3.s\n"
+ "fadd z18.s, z2.s, z1.s\n"
+ "fadd z17.s, z31.s, z30.s\n"
+ "fadd z22.s, z29.s, z28.s\n"
+ "fadd z16.s, z19.s, z16.s\n"
+ "fadd z19.s, z27.s, z18.s\n"
+ "fadd z21.s, z25.s, z18.s\n"
+ "fadd z18.s, z26.s, z17.s\n"
+ "fadd z20.s, z24.s, z17.s\n"
+ "fadd z17.s, z23.s, z16.s\n"
+ "fadd z16.s, z22.s, z16.s\n"
+ "fadd z19.s, z17.s, z19.s\n"
+ "fadd z18.s, z17.s, z18.s\n"
+ "fadd z17.s, z21.s, z16.s\n"
+ "fadd z16.s, z20.s, z16.s\n"
+ "fmul z19.s, z19.s, z0.s[0]\n"
"fmul z18.s, z18.s, z0.s[1]\n"
"fmul z17.s, z17.s, z0.s[2]\n"
- "fadd z16.s, z19.s, z20.s\n"
"fmul z16.s, z16.s, z0.s[3]\n"
- "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
- "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
- "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
+ "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+ "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+ "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+ "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 7c94894892..4fd624ca9d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,12 +46,12 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"cntw x28\n"
"cntw x27, ALL, MUL #2\n"
"cntw x26, ALL, MUL #3\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
"whilelt p3.s, x9, %x[n_channels]\n"
- "ld1rw { z7.s }, p0/Z, [%x[rescale_ptr]]\n"
"whilelt p2.s, x28, %x[n_channels]\n"
"whilelt p1.s, x27, %x[n_channels]\n"
"whilelt p0.s, x26, %x[n_channels]\n"
+ "ld1rw { z7.s }, p4/Z, [%x[rescale_ptr]]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
@@ -93,17 +93,17 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
"fadd z18.s, z22.s, z18.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fadd z17.s, z21.s, z17.s\n"
"ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
"ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
- "fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
"ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
"ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
@@ -142,30 +142,30 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fadd z6.s, z6.s, z16.s\n"
- "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
- "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
- "fadd z5.s, z5.s, z17.s\n"
- "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
"ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "fadd z4.s, z4.s, z17.s\n"
"fadd z3.s, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"fmul z6.s, z6.s, z7.s\n"
"fmul z5.s, z5.s, z7.s\n"
- "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
"fmul z4.s, z4.s, z7.s\n"
"fmul z3.s, z3.s, z7.s\n"
- "st1w { z5.s }, p2, [%x[outptr], x28, LSL #2]\n"
- "st1w { z4.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
"incw x9, ALL, MUL #4\n"
+ "st1w { z5.s }, p2, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
+ "st1w { z4.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "incw x27, ALL, MUL #4\n"
"st1w { z3.s }, p0, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
"whilelt p0.s, x26, %x[n_channels]\n"
- "incw x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
"whilelt p3.s, x9, %x[n_channels]\n"
@@ -189,14 +189,14 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"fadd z16.s, z0.s, z31.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fadd z16.s, z17.s, z16.s\n"
"subs x25, x25, #0x1\n"
- "fadd z6.s, z6.s, z16.s\n"
"add x24, x24, #0x20\n"
+ "fadd z16.s, z17.s, z16.s\n"
"ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
"ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
"ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fadd z6.s, z6.s, z16.s\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
"fadd z17.s, z2.s, z1.s\n"
@@ -208,8 +208,8 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z6.s, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
@@ -221,7 +221,7 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index d9cebd1363..dcd182fa97 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,22 +66,22 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p0.s, x14, x15\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "mov x11, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
+ "whilelt p0.s, x14, x15\n"
"ldp x28, x27, [x20, #0x0]\n"
"ldp x26, x25, [x20, #0x10]\n"
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
"ld1w { z31.s }, p0/Z, [x27, x14, LSL #2]\n"
- "ld1w { z30.s }, p0/Z, [x24, x14, LSL #2]\n"
- "ld1w { z29.s }, p0/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x24, x14, LSL #2]\n"
"ld1w { z28.s }, p0/Z, [x25, x14, LSL #2]\n"
- "ld1w { z27.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x14, LSL #2]\n"
"ld1w { z26.s }, p0/Z, [x26, x14, LSL #2]\n"
"ld1w { z25.s }, p0/Z, [x23, x14, LSL #2]\n"
"ld1w { z24.s }, p0/Z, [x22, x14, LSL #2]\n"
@@ -90,50 +90,50 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p1.s, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
+ "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z29.s\n"
+ "movprfx z21, z29\n fmax z21.s, p2/M, z21.s, z27.s\n"
"ld1w { z31.s }, p1/Z, [x27, x14, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x24, x14, LSL #2]\n"
- "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
- "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
- "ld1w { z29.s }, p1/Z, [x21, x14, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x28, x14, LSL #2]\n"
- "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
- "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z29.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z30.s\n"
+ "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
+ "ld1w { z27.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x28, x14, LSL #2]\n"
+ "movprfx z16, z28\n fmax z16.s, p2/M, z16.s, z24.s\n"
+ "movprfx z20, z25\n fmax z20.s, p2/M, z20.s, z23.s\n"
"ld1w { z28.s }, p1/Z, [x25, x14, LSL #2]\n"
"ld1w { z26.s }, p1/Z, [x26, x14, LSL #2]\n"
"ld1w { z25.s }, p1/Z, [x23, x14, LSL #2]\n"
"ld1w { z24.s }, p1/Z, [x22, x14, LSL #2]\n"
- "whilelt p0.s, x11, x15\n"
- "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "whilelt p0.s, x13, x15\n"
"ld1w { z23.s }, p1/Z, [x20, x14, LSL #2]\n"
"incw x14\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
"whilelt p1.s, x14, x15\n"
- "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
- "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
- "fmax z17.s, p2/M, z17.s, z21.s\n"
- "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
- "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
- "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
- "incw x11\n"
+ "st1w { z19.s }, p0, [x12, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x11, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x13, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x13, LSL #2]\n"
+ "incw x13\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
- "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
- "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
- "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
- "whilelt p0.s, x11, x15\n"
- "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
- "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
- "fmax z17.s, p2/M, z17.s, z21.s\n"
- "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
- "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
- "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
+ "movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z29.s\n"
+ "movprfx z21, z29\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z30.s\n"
+ "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
+ "movprfx z16, z28\n fmax z16.s, p2/M, z16.s, z24.s\n"
+ "movprfx z20, z25\n fmax z20.s, p2/M, z20.s, z23.s\n"
+ "whilelt p0.s, x13, x15\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "st1w { z19.s }, p0, [x12, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x11, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x13, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x13, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 87fc75adda..132c8bd8db 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,176 +44,176 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
"cntw x28\n"
"cntw x27, ALL, MUL #2\n"
"cntw x26, ALL, MUL #3\n"
- "whilelt p4.s, x9, %x[n_channels]\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
- "whilelt p2.s, x27, %x[n_channels]\n"
- "whilelt p1.s, x26, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x27, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.s, #0xff800000\n"
- "mov z7.s, #0xff800000\n"
- "mov x24, %x[inptrs]\n"
"mov z6.s, #0xff800000\n"
"mov z5.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z4.s, #0xff800000\n"
+ "mov z3.s, #0xff800000\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z31.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z28.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
- "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "movprfx z19, z2\n fmax z19.s, p4/M, z19.s, z1.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
- "fmax z22.s, p0/M, z22.s, z30.s\n"
- "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
- "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
- "fmax z21.s, p0/M, z21.s, z27.s\n"
- "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
- "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
- "fmax z20.s, p0/M, z20.s, z24.s\n"
- "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "fmax z18.s, p0/M, z18.s, z22.s\n"
- "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
- "fmax z17.s, p0/M, z17.s, z21.s\n"
- "fmax z16.s, p0/M, z16.s, z20.s\n"
- "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
+ "fmax z22.s, p4/M, z22.s, z29.s\n"
+ "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
+ "fmax z21.s, p4/M, z21.s, z26.s\n"
+ "fmax z16.s, p4/M, z16.s, z25.s\n"
+ "fmax z20.s, p4/M, z20.s, z24.s\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "fmax z19.s, p4/M, z19.s, z23.s\n"
+ "fmax z18.s, p4/M, z18.s, z22.s\n"
+ "ld1w { z23.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fmax z17.s, p4/M, z17.s, z21.s\n"
"subs x25, x25, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z19.s\n"
- "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
- "fmax z7.s, p0/M, z7.s, z18.s\n"
- "fmax z6.s, p0/M, z6.s, z17.s\n"
- "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fmax z5.s, p0/M, z5.s, z16.s\n"
+ "ld1w { z31.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmax z16.s, p4/M, z16.s, z20.s\n"
"add x24, x24, #0x20\n"
- "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmax z6.s, p4/M, z6.s, z19.s\n"
+ "fmax z5.s, p4/M, z5.s, z18.s\n"
+ "ld1w { z28.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "fmax z4.s, p4/M, z4.s, z17.s\n"
+ "ld1w { z21.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fmax z3.s, p4/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
- "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
- "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
- "fmax z22.s, p0/M, z22.s, z30.s\n"
- "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
- "fmax z21.s, p0/M, z21.s, z27.s\n"
- "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
- "fmax z20.s, p0/M, z20.s, z24.s\n"
- "fmax z19.s, p0/M, z19.s, z23.s\n"
- "fmax z18.s, p0/M, z18.s, z22.s\n"
- "fmax z17.s, p0/M, z17.s, z21.s\n"
- "fmax z16.s, p0/M, z16.s, z20.s\n"
- "fmax z8.s, p0/M, z8.s, z19.s\n"
- "fmax z7.s, p0/M, z7.s, z18.s\n"
- "fmax z6.s, p0/M, z6.s, z17.s\n"
- "fmax z5.s, p0/M, z5.s, z16.s\n"
+ "movprfx z19, z2\n fmax z19.s, p4/M, z19.s, z1.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
+ "fmax z22.s, p4/M, z22.s, z29.s\n"
+ "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
+ "fmax z21.s, p4/M, z21.s, z26.s\n"
+ "fmax z16.s, p4/M, z16.s, z25.s\n"
+ "fmax z20.s, p4/M, z20.s, z24.s\n"
+ "fmax z19.s, p4/M, z19.s, z23.s\n"
+ "fmax z18.s, p4/M, z18.s, z22.s\n"
+ "fmax z17.s, p4/M, z17.s, z21.s\n"
+ "fmax z16.s, p4/M, z16.s, z20.s\n"
+ "fmax z6.s, p4/M, z6.s, z19.s\n"
+ "fmax z5.s, p4/M, z5.s, z18.s\n"
+ "fmax z4.s, p4/M, z4.s, z17.s\n"
+ "fmax z3.s, p4/M, z3.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z16.s\n"
- "ld1w { z17.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
- "fmax z7.s, p0/M, z7.s, z17.s\n"
- "fmax z6.s, p0/M, z6.s, z16.s\n"
- "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fmax z5.s, p0/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fmax z6.s, p4/M, z6.s, z19.s\n"
+ "fmax z5.s, p4/M, z5.s, z18.s\n"
+ "fmax z4.s, p4/M, z4.s, z17.s\n"
+ "fmax z3.s, p4/M, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
"incw x9, ALL, MUL #4\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
+ "st1w { z5.s }, p2, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
- "st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
+ "st1w { z4.s }, p1, [%x[outptr], x27, LSL #2]\n"
"incw x27, ALL, MUL #4\n"
- "st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "st1w { z3.s }, p0, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "whilelt p1.s, x26, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.s, x9, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.s, #0xff800000\n"
+ "mov z6.s, #0xff800000\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
- "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
- "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "movprfx z16, z2\n fmax z16.s, p4/M, z16.s, z1.s\n"
+ "movprfx z17, z23\n fmax z17.s, p4/M, z17.s, z0.s\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "fmax z16.s, p0/M, z16.s, z17.s\n"
"subs x25, x25, #0x1\n"
- "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
- "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
- "fmax z8.s, p0/M, z8.s, z16.s\n"
"add x24, x24, #0x20\n"
- "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
- "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "fmax z16.s, p4/M, z16.s, z17.s\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fmax z6.s, p4/M, z6.s, z16.s\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
- "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
- "fmax z16.s, p0/M, z16.s, z17.s\n"
- "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "movprfx z16, z2\n fmax z16.s, p4/M, z16.s, z1.s\n"
+ "movprfx z17, z23\n fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z17.s\n"
+ "fmax z6.s, p4/M, z6.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
"subs x21, x21, #0x1\n"
- "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "fmax z6.s, p4/M, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
"incw x9\n"
- "whilelt p4.s, x9, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 7925905e64..d59765af0a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,11 +99,11 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"cntb x26\n"
"cntb x25, ALL, MUL #2\n"
"cntb x24, ALL, MUL #3\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
- "whilelt p2.b, x25, %x[n_channels]\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
+ "whilelt p2.b, x26, %x[n_channels]\n"
+ "whilelt p1.b, x25, %x[n_channels]\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x23, %x[n_valid_cells], #0x1\n"
@@ -128,14 +128,14 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
@@ -145,24 +145,24 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
@@ -204,17 +204,17 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- "ld1b { z17.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z19.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x24]\n"
+ ".inst 0x4508a277 // sshllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508a676 // sshllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508a255 // sshllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508a654 // sshllt z20.h, z18.b, #0x0\n"
".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
@@ -235,98 +235,98 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
- ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
- ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
- ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
- ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
- ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
- ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
- ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
- ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
- ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
- ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
- ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
- ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
- ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
- ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
- ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
- ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
- ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
- ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
- ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
- ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smax z11.s, p0/M, z11.s, z16.s\n"
- "smax z10.s, p0/M, z10.s, z16.s\n"
- "smax z9.s, p0/M, z9.s, z16.s\n"
- "smax z8.s, p0/M, z8.s, z16.s\n"
- "smax z7.s, p0/M, z7.s, z16.s\n"
- "smax z6.s, p0/M, z6.s, z16.s\n"
- "smax z5.s, p0/M, z5.s, z16.s\n"
- "smax z4.s, p0/M, z4.s, z16.s\n"
- "smax z3.s, p0/M, z3.s, z16.s\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "trn1 z17.h, z15.h, z14.h\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
- "smin z11.s, p0/M, z11.s, z18.s\n"
- "trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z10.s, p0/M, z10.s, z18.s\n"
- "smin z9.s, p0/M, z9.s, z18.s\n"
- "trn1 z17.h, z11.h, z10.h\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "smin z8.s, p0/M, z8.s, z18.s\n"
- "smin z7.s, p0/M, z7.s, z18.s\n"
+ "ld1rw { z18.s }, p4/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z17.s }, p4/Z, [%x[shift_ptr]]\n"
+ "mov z20.s, #0x7f\n"
+ "not z16.s, p4/M, z20.s\n"
+ ".inst 0x04b275ef // sqdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x04b2754a // sqdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x04b27508 // sqdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
+ ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
+ ".inst 0x04b274e7 // sqdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x04b274c6 // sqdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
+ ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ ".inst 0x04b274a5 // sqdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x04b27484 // sqdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
+ ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
+ ".inst 0x04b27463 // sqdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x04b27442 // sqdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
+ ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
+ ".inst 0x04b27421 // sqdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27400 // sqdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
+ ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
+ ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
+ ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
+ ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
+ ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
+ ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
+ ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
+ "smax z15.s, p4/M, z15.s, z16.s\n"
+ "smax z14.s, p4/M, z14.s, z16.s\n"
+ "smax z13.s, p4/M, z13.s, z16.s\n"
+ "smax z12.s, p4/M, z12.s, z16.s\n"
+ "smax z11.s, p4/M, z11.s, z16.s\n"
+ "smax z10.s, p4/M, z10.s, z16.s\n"
+ "smax z9.s, p4/M, z9.s, z16.s\n"
+ "smax z8.s, p4/M, z8.s, z16.s\n"
+ "smax z7.s, p4/M, z7.s, z16.s\n"
+ "smax z6.s, p4/M, z6.s, z16.s\n"
+ "smax z5.s, p4/M, z5.s, z16.s\n"
+ "smax z4.s, p4/M, z4.s, z16.s\n"
+ "smax z3.s, p4/M, z3.s, z16.s\n"
+ "smax z2.s, p4/M, z2.s, z16.s\n"
+ "smax z1.s, p4/M, z1.s, z16.s\n"
+ "smax z0.s, p4/M, z0.s, z16.s\n"
+ "smin z15.s, p4/M, z15.s, z20.s\n"
+ "smin z14.s, p4/M, z14.s, z20.s\n"
+ "smin z13.s, p4/M, z13.s, z20.s\n"
+ "smin z12.s, p4/M, z12.s, z20.s\n"
+ "smin z11.s, p4/M, z11.s, z20.s\n"
+ "smin z10.s, p4/M, z10.s, z20.s\n"
+ "smin z9.s, p4/M, z9.s, z20.s\n"
+ "smin z8.s, p4/M, z8.s, z20.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z7.s, p4/M, z7.s, z20.s\n"
+ "smin z6.s, p4/M, z6.s, z20.s\n"
+ "trn1 z17.h, z13.h, z12.h\n"
+ "smin z5.s, p4/M, z5.s, z20.s\n"
+ "smin z4.s, p4/M, z4.s, z20.s\n"
+ "trn1 z18.h, z11.h, z10.h\n"
+ "smin z3.s, p4/M, z3.s, z20.s\n"
+ "smin z2.s, p4/M, z2.s, z20.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z6.s, p0/M, z6.s, z18.s\n"
- "smin z5.s, p0/M, z5.s, z18.s\n"
- "trn1 z17.h, z7.h, z6.h\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z4.s, p0/M, z4.s, z18.s\n"
- "smin z3.s, p0/M, z3.s, z18.s\n"
- "trn1 z16.h, z5.h, z4.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "trn1 z17.h, z3.h, z2.h\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z1.s, p4/M, z1.s, z20.s\n"
+ "smin z0.s, p4/M, z0.s, z20.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "trn1 z20.b, z19.b, z17.b\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z18.b, z18.b, z16.b\n"
"trn1 z16.h, z1.h, z0.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "incb x24, ALL, MUL #4\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
+ "st1b { z20.b }, p3, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p2, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p1, [%x[outptr], x25]\n"
"incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p0, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x23, %x[n_valid_cells], #0x1\n"
@@ -339,21 +339,21 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- "add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
@@ -367,42 +367,42 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p3/Z, [x20, x27]\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
+ "ld1rw { z19.s }, p4/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z18.s }, p4/Z, [%x[shift_ptr]]\n"
+ "mov z17.s, #0x7f\n"
+ "not z16.s, p4/M, z17.s\n"
+ ".inst 0x04b375ef // sqdmulh z15.s, z15.s, z19.s\n"
+ ".inst 0x04b375ce // sqdmulh z14.s, z14.s, z19.s\n"
+ ".inst 0x04b375ad // sqdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x04b3758c // sqdmulh z12.s, z12.s, z19.s\n"
+ ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
+ ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
+ ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
+ ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
+ "smax z15.s, p4/M, z15.s, z16.s\n"
+ "smax z14.s, p4/M, z14.s, z16.s\n"
+ "smax z13.s, p4/M, z13.s, z16.s\n"
+ "smax z12.s, p4/M, z12.s, z16.s\n"
+ "smin z15.s, p4/M, z15.s, z17.s\n"
+ "smin z14.s, p4/M, z14.s, z17.s\n"
+ "smin z13.s, p4/M, z13.s, z17.s\n"
+ "smin z12.s, p4/M, z12.s, z17.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "st1b { z16.b }, p3, [%x[outptr], x27]\n"
"incb x27\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 5681cc1f3d..6e9422025c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,22 +66,22 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p0.b, x14, x15\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "mov x11, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
+ "whilelt p0.b, x14, x15\n"
"ldp x28, x27, [x20, #0x0]\n"
"ldp x26, x25, [x20, #0x10]\n"
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
"ld1b { z31.b }, p0/Z, [x27, x14]\n"
- "ld1b { z30.b }, p0/Z, [x24, x14]\n"
- "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
"ld1b { z28.b }, p0/Z, [x25, x14]\n"
- "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x14]\n"
"ld1b { z26.b }, p0/Z, [x26, x14]\n"
"ld1b { z25.b }, p0/Z, [x23, x14]\n"
"ld1b { z24.b }, p0/Z, [x22, x14]\n"
@@ -90,50 +90,50 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
+ "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z29.b\n"
+ "movprfx z21, z29\n smax z21.b, p2/M, z21.b, z27.b\n"
"ld1b { z31.b }, p1/Z, [x27, x14]\n"
- "ld1b { z30.b }, p1/Z, [x24, x14]\n"
- "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
- "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
- "ld1b { z29.b }, p1/Z, [x21, x14]\n"
- "ld1b { z27.b }, p1/Z, [x28, x14]\n"
- "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
- "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z29.b }, p1/Z, [x24, x14]\n"
+ "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z30.b\n"
+ "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x28, x14]\n"
+ "movprfx z16, z28\n smax z16.b, p2/M, z16.b, z24.b\n"
+ "movprfx z20, z25\n smax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z28.b }, p1/Z, [x25, x14]\n"
"ld1b { z26.b }, p1/Z, [x26, x14]\n"
"ld1b { z25.b }, p1/Z, [x23, x14]\n"
"ld1b { z24.b }, p1/Z, [x22, x14]\n"
- "whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "whilelt p0.b, x13, x15\n"
"ld1b { z23.b }, p1/Z, [x20, x14]\n"
"incw x14\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
"whilelt p1.b, x14, x15\n"
- "st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
- "smax z17.b, p2/M, z17.b, z21.b\n"
- "st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
- "st1b { z17.b }, p0, [x10, x11]\n"
- "st1b { z16.b }, p0, [x9, x11]\n"
- "incw x11\n"
+ "st1b { z19.b }, p0, [x12, x13]\n"
+ "st1b { z18.b }, p0, [x11, x13]\n"
+ "st1b { z17.b }, p0, [x10, x13]\n"
+ "st1b { z16.b }, p0, [x9, x13]\n"
+ "incw x13\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
- "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
- "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
- "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
- "whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
- "smax z17.b, p2/M, z17.b, z21.b\n"
- "st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
- "st1b { z17.b }, p0, [x10, x11]\n"
- "st1b { z16.b }, p0, [x9, x11]\n"
+ "movprfx z22, z31\n smax z22.b, p2/M, z22.b, z29.b\n"
+ "movprfx z21, z29\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z30.b\n"
+ "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
+ "movprfx z16, z28\n smax z16.b, p2/M, z16.b, z24.b\n"
+ "movprfx z20, z25\n smax z20.b, p2/M, z20.b, z23.b\n"
+ "whilelt p0.b, x13, x15\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z19.b }, p0, [x12, x13]\n"
+ "st1b { z18.b }, p0, [x11, x13]\n"
+ "st1b { z17.b }, p0, [x10, x13]\n"
+ "st1b { z16.b }, p0, [x9, x13]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
index da9e1408f9..0d9f607066 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,176 +44,176 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
"cntb x28\n"
"cntb x27, ALL, MUL #2\n"
"cntb x26, ALL, MUL #3\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
- "whilelt p2.b, x27, %x[n_channels]\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
+ "whilelt p2.b, x28, %x[n_channels]\n"
+ "whilelt p1.b, x27, %x[n_channels]\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x80\n"
- "mov z7.b, #0x80\n"
- "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z19, z2\n smax z19.b, p4/M, z19.b, z1.b\n"
+ "smax z23.b, p4/M, z23.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
- "smax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
- "smax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
- "smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "smax z17.b, p0/M, z17.b, z21.b\n"
- "smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
+ "smax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
+ "smax z21.b, p4/M, z21.b, z26.b\n"
+ "smax z16.b, p4/M, z16.b, z25.b\n"
+ "smax z20.b, p4/M, z20.b, z24.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "smax z19.b, p4/M, z19.b, z23.b\n"
+ "smax z18.b, p4/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "smax z17.b, p4/M, z17.b, z21.b\n"
"subs x25, x25, #0x1\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "smax z7.b, p0/M, z7.b, z18.b\n"
- "smax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "smax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "smax z16.b, p4/M, z16.b, z20.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "smax z6.b, p4/M, z6.b, z19.b\n"
+ "smax z5.b, p4/M, z5.b, z18.b\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "smax z4.b, p4/M, z4.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "smax z3.b, p4/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
- "smax z22.b, p0/M, z22.b, z30.b\n"
- "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
- "smax z21.b, p0/M, z21.b, z27.b\n"
- "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
- "smax z20.b, p0/M, z20.b, z24.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z18.b, p0/M, z18.b, z22.b\n"
- "smax z17.b, p0/M, z17.b, z21.b\n"
- "smax z16.b, p0/M, z16.b, z20.b\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "smax z7.b, p0/M, z7.b, z18.b\n"
- "smax z6.b, p0/M, z6.b, z17.b\n"
- "smax z5.b, p0/M, z5.b, z16.b\n"
+ "movprfx z19, z2\n smax z19.b, p4/M, z19.b, z1.b\n"
+ "smax z23.b, p4/M, z23.b, z0.b\n"
+ "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
+ "smax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
+ "smax z21.b, p4/M, z21.b, z26.b\n"
+ "smax z16.b, p4/M, z16.b, z25.b\n"
+ "smax z20.b, p4/M, z20.b, z24.b\n"
+ "smax z19.b, p4/M, z19.b, z23.b\n"
+ "smax z18.b, p4/M, z18.b, z22.b\n"
+ "smax z17.b, p4/M, z17.b, z21.b\n"
+ "smax z16.b, p4/M, z16.b, z20.b\n"
+ "smax z6.b, p4/M, z6.b, z19.b\n"
+ "smax z5.b, p4/M, z5.b, z18.b\n"
+ "smax z4.b, p4/M, z4.b, z17.b\n"
+ "smax z3.b, p4/M, z3.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
- "ld1b { z17.b }, p3/Z, [x20, x28]\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "smax z7.b, p0/M, z7.b, z17.b\n"
- "smax z6.b, p0/M, z6.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "smax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z19.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x26]\n"
+ "smax z6.b, p4/M, z6.b, z19.b\n"
+ "smax z5.b, p4/M, z5.b, z18.b\n"
+ "smax z4.b, p4/M, z4.b, z17.b\n"
+ "smax z3.b, p4/M, z3.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z6.b }, p3, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
+ "st1b { z5.b }, p2, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
- "st1b { z6.b }, p2, [%x[outptr], x27]\n"
+ "st1b { z4.b }, p1, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
- "st1b { z5.b }, p1, [%x[outptr], x26]\n"
+ "st1b { z3.b }, p0, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x80\n"
+ "mov z6.b, #0x80\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "movprfx z16, z2\n smax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n smax z17.b, p4/M, z17.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "smax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "smax z16.b, p4/M, z16.b, z17.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "smax z6.b, p4/M, z6.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
- "smax z16.b, p0/M, z16.b, z17.b\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
+ "movprfx z16, z2\n smax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n smax z17.b, p4/M, z17.b, z0.b\n"
+ "smax z16.b, p4/M, z16.b, z17.b\n"
+ "smax z6.b, p4/M, z6.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x9]\n"
+ "smax z6.b, p4/M, z6.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z6.b }, p3, [%x[outptr], x9]\n"
"incb x9\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 19a3b112ad..f09cbc9666 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -118,11 +118,11 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"cntb x26\n"
"cntb x25, ALL, MUL #2\n"
"cntb x24, ALL, MUL #3\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
- "whilelt p2.b, x25, %x[n_channels]\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
+ "whilelt p2.b, x26, %x[n_channels]\n"
+ "whilelt p1.b, x25, %x[n_channels]\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x23, %x[n_valid_cells], #0x1\n"
@@ -147,14 +147,14 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
@@ -164,24 +164,24 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
@@ -223,17 +223,17 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- "ld1b { z17.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z19.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x24]\n"
+ ".inst 0x4508a277 // sshllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508a676 // sshllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508a255 // sshllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508a654 // sshllt z20.h, z18.b, #0x0\n"
".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
@@ -254,115 +254,115 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
- "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
- ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
- ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
- ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
- ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
- ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
- ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
- ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
- ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
- ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
- ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
- ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
- ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
- ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
- ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
- ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
- ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
- ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
- ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
- ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
- ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
- ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
- ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
- ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
- ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
- ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
- ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
- ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
- ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
- ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
- ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
- ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
- ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
- ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smax z11.s, p0/M, z11.s, z16.s\n"
- "smax z10.s, p0/M, z10.s, z16.s\n"
- "smax z9.s, p0/M, z9.s, z16.s\n"
- "smax z8.s, p0/M, z8.s, z16.s\n"
- "smax z7.s, p0/M, z7.s, z16.s\n"
- "smax z6.s, p0/M, z6.s, z16.s\n"
- "smax z5.s, p0/M, z5.s, z16.s\n"
- "smax z4.s, p0/M, z4.s, z16.s\n"
- "smax z3.s, p0/M, z3.s, z16.s\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "trn1 z17.h, z15.h, z14.h\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
- "smin z11.s, p0/M, z11.s, z18.s\n"
- "trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z10.s, p0/M, z10.s, z18.s\n"
- "smin z9.s, p0/M, z9.s, z18.s\n"
- "trn1 z17.h, z11.h, z10.h\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "smin z8.s, p0/M, z8.s, z18.s\n"
- "smin z7.s, p0/M, z7.s, z18.s\n"
+ "ld1rw { z19.s }, p4/Z, [%x[left_shift]]\n"
+ "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
+ "mov z20.s, #0x7f\n"
+ "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
+ "not z16.s, p4/M, z20.s\n"
+ ".inst 0x4482926f // srshl z15.s, p4/M, z15.s, z19.s\n"
+ ".inst 0x4482926e // srshl z14.s, p4/M, z14.s, z19.s\n"
+ ".inst 0x4482926d // srshl z13.s, p4/M, z13.s, z19.s\n"
+ ".inst 0x4482926c // srshl z12.s, p4/M, z12.s, z19.s\n"
+ ".inst 0x4482926b // srshl z11.s, p4/M, z11.s, z19.s\n"
+ ".inst 0x4482926a // srshl z10.s, p4/M, z10.s, z19.s\n"
+ ".inst 0x44829269 // srshl z9.s, p4/M, z9.s, z19.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x44829268 // srshl z8.s, p4/M, z8.s, z19.s\n"
+ ".inst 0x44829267 // srshl z7.s, p4/M, z7.s, z19.s\n"
+ ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x44829266 // srshl z6.s, p4/M, z6.s, z19.s\n"
+ ".inst 0x44829265 // srshl z5.s, p4/M, z5.s, z19.s\n"
+ ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x44829264 // srshl z4.s, p4/M, z4.s, z19.s\n"
+ ".inst 0x44829263 // srshl z3.s, p4/M, z3.s, z19.s\n"
+ ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x44829262 // srshl z2.s, p4/M, z2.s, z19.s\n"
+ ".inst 0x44829261 // srshl z1.s, p4/M, z1.s, z19.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x44829260 // srshl z0.s, p4/M, z0.s, z19.s\n"
+ ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
+ ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
+ ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
+ ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
+ ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
+ ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
+ ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
+ ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
+ ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
+ ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
+ ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
+ ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
+ ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
+ ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
+ "smax z15.s, p4/M, z15.s, z16.s\n"
+ "smax z14.s, p4/M, z14.s, z16.s\n"
+ "smax z13.s, p4/M, z13.s, z16.s\n"
+ "smax z12.s, p4/M, z12.s, z16.s\n"
+ "smax z11.s, p4/M, z11.s, z16.s\n"
+ "smax z10.s, p4/M, z10.s, z16.s\n"
+ "smax z9.s, p4/M, z9.s, z16.s\n"
+ "smax z8.s, p4/M, z8.s, z16.s\n"
+ "smax z7.s, p4/M, z7.s, z16.s\n"
+ "smax z6.s, p4/M, z6.s, z16.s\n"
+ "smax z5.s, p4/M, z5.s, z16.s\n"
+ "smax z4.s, p4/M, z4.s, z16.s\n"
+ "smax z3.s, p4/M, z3.s, z16.s\n"
+ "smax z2.s, p4/M, z2.s, z16.s\n"
+ "smax z1.s, p4/M, z1.s, z16.s\n"
+ "smax z0.s, p4/M, z0.s, z16.s\n"
+ "smin z15.s, p4/M, z15.s, z20.s\n"
+ "smin z14.s, p4/M, z14.s, z20.s\n"
+ "smin z13.s, p4/M, z13.s, z20.s\n"
+ "smin z12.s, p4/M, z12.s, z20.s\n"
+ "smin z11.s, p4/M, z11.s, z20.s\n"
+ "smin z10.s, p4/M, z10.s, z20.s\n"
+ "smin z9.s, p4/M, z9.s, z20.s\n"
+ "smin z8.s, p4/M, z8.s, z20.s\n"
+ "smin z7.s, p4/M, z7.s, z20.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z6.s, p4/M, z6.s, z20.s\n"
+ "smin z5.s, p4/M, z5.s, z20.s\n"
+ "trn1 z17.h, z13.h, z12.h\n"
+ "smin z4.s, p4/M, z4.s, z20.s\n"
+ "smin z3.s, p4/M, z3.s, z20.s\n"
+ "trn1 z18.h, z11.h, z10.h\n"
+ "smin z2.s, p4/M, z2.s, z20.s\n"
+ "smin z1.s, p4/M, z1.s, z20.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z6.s, p0/M, z6.s, z18.s\n"
- "smin z5.s, p0/M, z5.s, z18.s\n"
- "trn1 z17.h, z7.h, z6.h\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z4.s, p0/M, z4.s, z18.s\n"
- "smin z3.s, p0/M, z3.s, z18.s\n"
- "trn1 z16.h, z5.h, z4.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "trn1 z17.h, z3.h, z2.h\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z0.s, p4/M, z0.s, z20.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "trn1 z20.b, z19.b, z17.b\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z18.b, z18.b, z16.b\n"
"trn1 z16.h, z1.h, z0.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "incb x24, ALL, MUL #4\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
+ "st1b { z20.b }, p3, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p2, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p1, [%x[outptr], x25]\n"
"incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p0, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x23, %x[n_valid_cells], #0x1\n"
@@ -375,21 +375,21 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- "add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
@@ -403,47 +403,47 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p3/Z, [x20, x27]\n"
".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z20.s }, p4/Z, [%x[left_shift]]\n"
+ "ld1rw { z19.s }, p4/Z, [%x[combined_rescale_value]]\n"
"mov z18.s, #0x7f\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
+ "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
+ "not z16.s, p4/M, z18.s\n"
+ ".inst 0x4482928f // srshl z15.s, p4/M, z15.s, z20.s\n"
+ ".inst 0x4482928e // srshl z14.s, p4/M, z14.s, z20.s\n"
+ ".inst 0x4482928d // srshl z13.s, p4/M, z13.s, z20.s\n"
+ ".inst 0x4482928c // srshl z12.s, p4/M, z12.s, z20.s\n"
+ ".inst 0x04b375ef // sqrdmulh z15.s, z15.s, z19.s\n"
+ ".inst 0x04b375ce // sqrdmulh z14.s, z14.s, z19.s\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x04b3758c // sqrdmulh z12.s, z12.s, z19.s\n"
+ ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
+ ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
+ ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
+ ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "smax z15.s, p4/M, z15.s, z16.s\n"
+ "smax z14.s, p4/M, z14.s, z16.s\n"
+ "smax z13.s, p4/M, z13.s, z16.s\n"
+ "smax z12.s, p4/M, z12.s, z16.s\n"
+ "smin z15.s, p4/M, z15.s, z18.s\n"
+ "smin z14.s, p4/M, z14.s, z18.s\n"
+ "smin z13.s, p4/M, z13.s, z18.s\n"
+ "smin z12.s, p4/M, z12.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "st1b { z16.b }, p3, [%x[outptr], x27]\n"
"incb x27\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 4fc1532d5a..5033aa9d73 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,337 +46,337 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
"cntb x28\n"
"cntb x27, ALL, MUL #2\n"
"cntb x26, ALL, MUL #3\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
- "whilelt p2.b, x27, %x[n_channels]\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
+ "whilelt p2.b, x28, %x[n_channels]\n"
+ "whilelt p1.b, x27, %x[n_channels]\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x80\n"
- "mov z7.b, #0x80\n"
- "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
"mov z5.b, #0x80\n"
+ "mov z4.b, #0x80\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z19, z2\n smax z19.b, p4/M, z19.b, z1.b\n"
+ "smax z23.b, p4/M, z23.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
- "smax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
- "smax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
- "smax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "smax z17.b, p0/M, z17.b, z21.b\n"
- "smax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
+ "smax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
+ "smax z21.b, p4/M, z21.b, z26.b\n"
+ "smax z16.b, p4/M, z16.b, z25.b\n"
+ "smax z20.b, p4/M, z20.b, z24.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "smax z19.b, p4/M, z19.b, z23.b\n"
+ "smax z18.b, p4/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "smax z17.b, p4/M, z17.b, z21.b\n"
"subs x25, x25, #0x1\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "smax z7.b, p0/M, z7.b, z18.b\n"
- "smax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "smax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "smax z16.b, p4/M, z16.b, z20.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "smax z6.b, p4/M, z6.b, z19.b\n"
+ "smax z3.b, p4/M, z3.b, z18.b\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "smax z5.b, p4/M, z5.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "smax z4.b, p4/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
- "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
- "smax z22.b, p0/M, z22.b, z30.b\n"
- "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
- "smax z21.b, p0/M, z21.b, z27.b\n"
- "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
- "smax z20.b, p0/M, z20.b, z24.b\n"
- "smax z19.b, p0/M, z19.b, z23.b\n"
- "smax z18.b, p0/M, z18.b, z22.b\n"
- "smax z17.b, p0/M, z17.b, z21.b\n"
- "smax z16.b, p0/M, z16.b, z20.b\n"
- "smax z8.b, p0/M, z8.b, z19.b\n"
- "smax z7.b, p0/M, z7.b, z18.b\n"
- "smax z6.b, p0/M, z6.b, z17.b\n"
- "smax z5.b, p0/M, z5.b, z16.b\n"
+ "movprfx z19, z2\n smax z19.b, p4/M, z19.b, z1.b\n"
+ "smax z23.b, p4/M, z23.b, z0.b\n"
+ "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
+ "smax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
+ "smax z21.b, p4/M, z21.b, z26.b\n"
+ "smax z16.b, p4/M, z16.b, z25.b\n"
+ "smax z20.b, p4/M, z20.b, z24.b\n"
+ "smax z19.b, p4/M, z19.b, z23.b\n"
+ "smax z18.b, p4/M, z18.b, z22.b\n"
+ "smax z17.b, p4/M, z17.b, z21.b\n"
+ "smax z16.b, p4/M, z16.b, z20.b\n"
+ "smax z6.b, p4/M, z6.b, z19.b\n"
+ "smax z3.b, p4/M, z3.b, z18.b\n"
+ "smax z5.b, p4/M, z5.b, z17.b\n"
+ "smax z4.b, p4/M, z4.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
- "ld1b { z17.b }, p3/Z, [x20, x28]\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "smax z7.b, p0/M, z7.b, z17.b\n"
- "smax z6.b, p0/M, z6.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "smax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z19.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x26]\n"
+ "smax z6.b, p4/M, z6.b, z19.b\n"
+ "smax z3.b, p4/M, z3.b, z18.b\n"
+ "smax z5.b, p4/M, z5.b, z17.b\n"
+ "smax z4.b, p4/M, z4.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- ".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- ".inst 0x4508a517 // sshllt z23.h, z8.b, #0x0\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z4.s }, p0/Z, [x20]\n"
- ".inst 0x4508a0f6 // sshllb z22.h, z7.b, #0x0\n"
- ".inst 0x4508a4f5 // sshllt z21.h, z7.b, #0x0\n"
+ ".inst 0x4508a0d3 // sshllb z19.h, z6.b, #0x0\n"
+ ".inst 0x4508a4d1 // sshllt z17.h, z6.b, #0x0\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4508a0d4 // sshllb z20.h, z6.b, #0x0\n"
- ".inst 0x4508a4d3 // sshllt z19.h, z6.b, #0x0\n"
+ ".inst 0x4508a072 // sshllb z18.h, z3.b, #0x0\n"
+ ".inst 0x4508a478 // sshllt z24.h, z3.b, #0x0\n"
+ "ld1rw { z3.s }, p4/Z, [x21]\n"
+ "ld1rw { z2.s }, p4/Z, [x20]\n"
+ ".inst 0x4508a0b5 // sshllb z21.h, z5.b, #0x0\n"
+ ".inst 0x4508a4b7 // sshllt z23.h, z5.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- ".inst 0x4508a0b2 // sshllb z18.h, z5.b, #0x0\n"
- ".inst 0x4508a4b0 // sshllt z16.h, z5.b, #0x0\n"
- ".inst 0x4510a221 // sshllb z1.s, z17.h, #0x0\n"
+ ".inst 0x4508a096 // sshllb z22.h, z4.b, #0x0\n"
+ ".inst 0x4508a494 // sshllt z20.h, z4.b, #0x0\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ ".inst 0x4510a261 // sshllb z1.s, z19.h, #0x0\n"
+ ".inst 0x4510a673 // sshllt z19.s, z19.h, #0x0\n"
+ ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
- ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
- ".inst 0x4510a2e0 // sshllb z0.s, z23.h, #0x0\n"
- ".inst 0x4510a6ff // sshllt z31.s, z23.h, #0x0\n"
- ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
- ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
- ".inst 0x4510a2de // sshllb z30.s, z22.h, #0x0\n"
- ".inst 0x4510a6dd // sshllt z29.s, z22.h, #0x0\n"
- ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
- ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x4510a25f // sshllb z31.s, z18.h, #0x0\n"
+ ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
+ ".inst 0x4510a31e // sshllb z30.s, z24.h, #0x0\n"
+ ".inst 0x4510a71d // sshllt z29.s, z24.h, #0x0\n"
+ ".inst 0x44829061 // srshl z1.s, p4/M, z1.s, z3.s\n"
+ ".inst 0x44829073 // srshl z19.s, p4/M, z19.s, z3.s\n"
".inst 0x4510a2bc // sshllb z28.s, z21.h, #0x0\n"
- ".inst 0x4510a6bb // sshllt z27.s, z21.h, #0x0\n"
- ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
- ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
- ".inst 0x4510a29a // sshllb z26.s, z20.h, #0x0\n"
- ".inst 0x4510a699 // sshllt z25.s, z20.h, #0x0\n"
- ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
- ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
- ".inst 0x4510a278 // sshllb z24.s, z19.h, #0x0\n"
- ".inst 0x4510a677 // sshllt z23.s, z19.h, #0x0\n"
- ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
- ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
- ".inst 0x4510a256 // sshllb z22.s, z18.h, #0x0\n"
- ".inst 0x4510a655 // sshllt z21.s, z18.h, #0x0\n"
- ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
- ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
- ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
- ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
- ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
- ".inst 0x44828093 // srshl z19.s, p0/M, z19.s, z4.s\n"
- ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
- ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
- ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
- ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
- ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
- ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
- ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
- ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
- ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
- ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
- ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
- ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
- ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
- ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
- ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
- ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
- ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
- ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
- ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
- "mov z18.s, #0x7f\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z17.s, p0/M, z17.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "smax z30.s, p0/M, z30.s, z16.s\n"
- "smax z29.s, p0/M, z29.s, z16.s\n"
- "smax z28.s, p0/M, z28.s, z16.s\n"
- "smax z27.s, p0/M, z27.s, z16.s\n"
- "smax z26.s, p0/M, z26.s, z16.s\n"
- "smax z25.s, p0/M, z25.s, z16.s\n"
- "smax z24.s, p0/M, z24.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z22.s, p0/M, z22.s, z16.s\n"
- "smax z21.s, p0/M, z21.s, z16.s\n"
- "smax z20.s, p0/M, z20.s, z16.s\n"
- "smax z19.s, p0/M, z19.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "smin z17.s, p0/M, z17.s, z18.s\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
- "trn1 z17.h, z1.h, z17.h\n"
- "smin z31.s, p0/M, z31.s, z18.s\n"
- "smin z30.s, p0/M, z30.s, z18.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z29.s, p0/M, z29.s, z18.s\n"
- "smin z28.s, p0/M, z28.s, z18.s\n"
- "trn1 z17.h, z30.h, z29.h\n"
- "st1b { z16.b }, p4, [%x[outptr], x9]\n"
- "smin z27.s, p0/M, z27.s, z18.s\n"
- "smin z26.s, p0/M, z26.s, z18.s\n"
- "trn1 z16.h, z28.h, z27.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z25.s, p0/M, z25.s, z18.s\n"
- "smin z24.s, p0/M, z24.s, z18.s\n"
- "trn1 z17.h, z26.h, z25.h\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "smin z23.s, p0/M, z23.s, z18.s\n"
- "smin z22.s, p0/M, z22.s, z18.s\n"
- "trn1 z16.h, z24.h, z23.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z21.s, p0/M, z21.s, z18.s\n"
- "smin z20.s, p0/M, z20.s, z18.s\n"
- "trn1 z17.h, z22.h, z21.h\n"
- "st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z19.s, p0/M, z19.s, z18.s\n"
- "trn1 z16.h, z20.h, z19.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "incb x26, ALL, MUL #4\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
+ ".inst 0x4510a6b5 // sshllt z21.s, z21.h, #0x0\n"
+ ".inst 0x44829060 // srshl z0.s, p4/M, z0.s, z3.s\n"
+ ".inst 0x44829071 // srshl z17.s, p4/M, z17.s, z3.s\n"
+ ".inst 0x4510a2fb // sshllb z27.s, z23.h, #0x0\n"
+ ".inst 0x4510a6fa // sshllt z26.s, z23.h, #0x0\n"
+ ".inst 0x4482907f // srshl z31.s, p4/M, z31.s, z3.s\n"
+ ".inst 0x44829072 // srshl z18.s, p4/M, z18.s, z3.s\n"
+ ".inst 0x4510a2d9 // sshllb z25.s, z22.h, #0x0\n"
+ ".inst 0x4510a6d8 // sshllt z24.s, z22.h, #0x0\n"
+ ".inst 0x4482907e // srshl z30.s, p4/M, z30.s, z3.s\n"
+ ".inst 0x4482907d // srshl z29.s, p4/M, z29.s, z3.s\n"
+ ".inst 0x4510a297 // sshllb z23.s, z20.h, #0x0\n"
+ ".inst 0x4510a696 // sshllt z22.s, z20.h, #0x0\n"
+ ".inst 0x4482907c // srshl z28.s, p4/M, z28.s, z3.s\n"
+ ".inst 0x44829075 // srshl z21.s, p4/M, z21.s, z3.s\n"
+ ".inst 0x4482907b // srshl z27.s, p4/M, z27.s, z3.s\n"
+ ".inst 0x4482907a // srshl z26.s, p4/M, z26.s, z3.s\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ ".inst 0x04a27673 // sqrdmulh z19.s, z19.s, z2.s\n"
+ ".inst 0x44829079 // srshl z25.s, p4/M, z25.s, z3.s\n"
+ ".inst 0x44829078 // srshl z24.s, p4/M, z24.s, z3.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x44829077 // srshl z23.s, p4/M, z23.s, z3.s\n"
+ ".inst 0x44829076 // srshl z22.s, p4/M, z22.s, z3.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a27652 // sqrdmulh z18.s, z18.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ ".inst 0x04a277bd // sqrdmulh z29.s, z29.s, z2.s\n"
+ ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
+ ".inst 0x44829213 // srshl z19.s, p4/M, z19.s, z16.s\n"
+ ".inst 0x04a2779c // sqrdmulh z28.s, z28.s, z2.s\n"
+ ".inst 0x04a276b5 // sqrdmulh z21.s, z21.s, z2.s\n"
+ ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
+ ".inst 0x44829211 // srshl z17.s, p4/M, z17.s, z16.s\n"
+ ".inst 0x04a2777b // sqrdmulh z27.s, z27.s, z2.s\n"
+ ".inst 0x04a2775a // sqrdmulh z26.s, z26.s, z2.s\n"
+ ".inst 0x4482921f // srshl z31.s, p4/M, z31.s, z16.s\n"
+ ".inst 0x44829212 // srshl z18.s, p4/M, z18.s, z16.s\n"
+ ".inst 0x04a27739 // sqrdmulh z25.s, z25.s, z2.s\n"
+ ".inst 0x04a27718 // sqrdmulh z24.s, z24.s, z2.s\n"
+ ".inst 0x4482921e // srshl z30.s, p4/M, z30.s, z16.s\n"
+ ".inst 0x4482921d // srshl z29.s, p4/M, z29.s, z16.s\n"
+ ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ ".inst 0x4482921c // srshl z28.s, p4/M, z28.s, z16.s\n"
+ ".inst 0x44829215 // srshl z21.s, p4/M, z21.s, z16.s\n"
+ "mov z20.s, #0x7f\n"
+ ".inst 0x4482921b // srshl z27.s, p4/M, z27.s, z16.s\n"
+ ".inst 0x4482921a // srshl z26.s, p4/M, z26.s, z16.s\n"
+ ".inst 0x44829219 // srshl z25.s, p4/M, z25.s, z16.s\n"
+ ".inst 0x44829218 // srshl z24.s, p4/M, z24.s, z16.s\n"
+ ".inst 0x44829217 // srshl z23.s, p4/M, z23.s, z16.s\n"
+ ".inst 0x44829216 // srshl z22.s, p4/M, z22.s, z16.s\n"
+ "not z16.s, p4/M, z20.s\n"
+ "smax z1.s, p4/M, z1.s, z16.s\n"
+ "smax z19.s, p4/M, z19.s, z16.s\n"
+ "smax z0.s, p4/M, z0.s, z16.s\n"
+ "smax z17.s, p4/M, z17.s, z16.s\n"
+ "smax z31.s, p4/M, z31.s, z16.s\n"
+ "smax z18.s, p4/M, z18.s, z16.s\n"
+ "smax z30.s, p4/M, z30.s, z16.s\n"
+ "smax z29.s, p4/M, z29.s, z16.s\n"
+ "smax z28.s, p4/M, z28.s, z16.s\n"
+ "smax z21.s, p4/M, z21.s, z16.s\n"
+ "smax z27.s, p4/M, z27.s, z16.s\n"
+ "smax z26.s, p4/M, z26.s, z16.s\n"
+ "smax z25.s, p4/M, z25.s, z16.s\n"
+ "smax z24.s, p4/M, z24.s, z16.s\n"
+ "smax z23.s, p4/M, z23.s, z16.s\n"
+ "smax z22.s, p4/M, z22.s, z16.s\n"
+ "smin z1.s, p4/M, z1.s, z20.s\n"
+ "smin z19.s, p4/M, z19.s, z20.s\n"
+ "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z17.s, p4/M, z17.s, z20.s\n"
+ "smin z31.s, p4/M, z31.s, z20.s\n"
+ "smin z18.s, p4/M, z18.s, z20.s\n"
+ "smin z30.s, p4/M, z30.s, z20.s\n"
+ "smin z29.s, p4/M, z29.s, z20.s\n"
+ "smin z28.s, p4/M, z28.s, z20.s\n"
+ "trn1 z19.h, z1.h, z19.h\n"
+ "smin z21.s, p4/M, z21.s, z20.s\n"
+ "smin z27.s, p4/M, z27.s, z20.s\n"
+ "trn1 z17.h, z0.h, z17.h\n"
+ "smin z26.s, p4/M, z26.s, z20.s\n"
+ "smin z25.s, p4/M, z25.s, z20.s\n"
+ "trn1 z18.h, z31.h, z18.h\n"
+ "smin z24.s, p4/M, z24.s, z20.s\n"
+ "smin z23.s, p4/M, z23.s, z20.s\n"
+ "trn1 z16.h, z30.h, z29.h\n"
+ "smin z22.s, p4/M, z22.s, z20.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "trn1 z20.b, z19.b, z17.b\n"
+ "trn1 z17.h, z27.h, z26.h\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z18.b, z18.b, z16.b\n"
+ "trn1 z16.h, z23.h, z22.h\n"
+ "st1b { z20.b }, p3, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p2, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p1, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p0, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x80\n"
+ "mov z6.b, #0x80\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "movprfx z16, z2\n smax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n smax z17.b, p4/M, z17.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "smax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "smax z16.b, p4/M, z16.b, z17.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "smax z6.b, p4/M, z6.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
- "smax z16.b, p0/M, z16.b, z17.b\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
+ "movprfx z16, z2\n smax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n smax z17.b, p4/M, z17.b, z0.b\n"
+ "smax z16.b, p4/M, z16.b, z17.b\n"
+ "smax z6.b, p4/M, z6.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x9]\n"
+ "smax z6.b, p4/M, z6.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- ".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- ".inst 0x4508a512 // sshllt z18.h, z8.b, #0x0\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
- ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ ".inst 0x4508a0d1 // sshllb z17.h, z6.b, #0x0\n"
+ ".inst 0x4508a4d0 // sshllt z16.h, z6.b, #0x0\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x4510a254 // sshllb z20.s, z18.h, #0x0\n"
- ".inst 0x4510a653 // sshllt z19.s, z18.h, #0x0\n"
- ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
- ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
- ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
- ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
- ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
- ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "ld1rw { z24.s }, p4/Z, [x21]\n"
+ "ld1rw { z23.s }, p4/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
- ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
- "mov z18.s, #0x7f\n"
- ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
- ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
- ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
- ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
- "not z16.s, p0/M, z18.s\n"
- "smax z22.s, p0/M, z22.s, z16.s\n"
- "smax z21.s, p0/M, z21.s, z16.s\n"
- "smax z20.s, p0/M, z20.s, z16.s\n"
- "smax z19.s, p0/M, z19.s, z16.s\n"
- "smin z22.s, p0/M, z22.s, z18.s\n"
- "smin z21.s, p0/M, z21.s, z18.s\n"
- "smin z20.s, p0/M, z20.s, z18.s\n"
- "trn1 z17.h, z22.h, z21.h\n"
- "smin z19.s, p0/M, z19.s, z18.s\n"
- "trn1 z16.h, z20.h, z19.h\n"
+ "mov z22.s, #0x7f\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
+ ".inst 0x4510a234 // sshllb z20.s, z17.h, #0x0\n"
+ ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ ".inst 0x4510a213 // sshllb z19.s, z16.h, #0x0\n"
+ ".inst 0x4510a612 // sshllt z18.s, z16.h, #0x0\n"
+ "not z16.s, p4/M, z22.s\n"
+ ".inst 0x44829314 // srshl z20.s, p4/M, z20.s, z24.s\n"
+ ".inst 0x44829311 // srshl z17.s, p4/M, z17.s, z24.s\n"
+ ".inst 0x44829313 // srshl z19.s, p4/M, z19.s, z24.s\n"
+ ".inst 0x44829312 // srshl z18.s, p4/M, z18.s, z24.s\n"
+ ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n"
+ ".inst 0x04b77631 // sqrdmulh z17.s, z17.s, z23.s\n"
+ ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n"
+ ".inst 0x04b77652 // sqrdmulh z18.s, z18.s, z23.s\n"
+ ".inst 0x448292b4 // srshl z20.s, p4/M, z20.s, z21.s\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ ".inst 0x448292b3 // srshl z19.s, p4/M, z19.s, z21.s\n"
+ ".inst 0x448292b2 // srshl z18.s, p4/M, z18.s, z21.s\n"
+ "smax z20.s, p4/M, z20.s, z16.s\n"
+ "smax z17.s, p4/M, z17.s, z16.s\n"
+ "smax z19.s, p4/M, z19.s, z16.s\n"
+ "smax z18.s, p4/M, z18.s, z16.s\n"
+ "smin z20.s, p4/M, z20.s, z22.s\n"
+ "smin z17.s, p4/M, z17.s, z22.s\n"
+ "smin z19.s, p4/M, z19.s, z22.s\n"
+ "smin z18.s, p4/M, z18.s, z22.s\n"
+ "trn1 z17.h, z20.h, z17.h\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z16.b }, p3, [%x[outptr], x9]\n"
"incb x9\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
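
Note on the epilogue changed above: the signed 8-bit max result is requantized per layer by widening to 32 bits (sshllb/sshllt), applying a rounding left shift (srshl), a rounding doubling high multiply (sqrdmulh), a rounding right shift (srshl by a negative amount) and clamping to [-128, 127] before the trn1 instructions narrow back to bytes. A rough scalar model of that chain, assuming the usual SVE semantics of srshl and sqrdmulh; the helper and parameter names below are illustrative, only per_layer_left_shift, per_layer_mul and per_layer_right_shift come from the kernel's quantization parameters:

    #include <stdint.h>

    /* Scalar sketch of the per-layer requantize chain: widen, rounding left
     * shift, rounding doubling high multiply, rounding right shift, clamp. */

    static int32_t srshl32(int32_t x, int32_t shift)
    {
        /* srshl: left shift for non-negative amounts, otherwise a rounding
         * arithmetic right shift by the negated amount. */
        if (shift >= 0)
            return (int32_t)((int64_t)x << shift);
        return (int32_t)(((int64_t)x + ((int64_t)1 << (-shift - 1))) >> -shift);
    }

    static int32_t sqrdmulh32(int32_t a, int32_t b)
    {
        /* sqrdmulh: saturating rounding doubling multiply, high half. */
        int64_t r = ((int64_t)a * b + ((int64_t)1 << 30)) >> 31;
        if (r > INT32_MAX) r = INT32_MAX;
        if (r < INT32_MIN) r = INT32_MIN;
        return (int32_t)r;
    }

    static int8_t requantize_s8(int8_t v, int32_t left_shift,
                                int32_t per_layer_mul, int32_t right_shift)
    {
        int32_t x = (int32_t)v;            /* sshllb/sshllt widening        */
        x = srshl32(x, left_shift);        /* per_layer_left_shift          */
        x = sqrdmulh32(x, per_layer_mul);  /* per_layer_mul                 */
        x = srshl32(x, right_shift);       /* per_layer_right_shift (< 0)   */
        if (x < -128) x = -128;            /* smax with ~0x7f               */
        if (x >  127) x =  127;            /* smin with 0x7f                */
        return (int8_t)x;                  /* trn1 narrowing                */
    }

The vector code applies the same chain to whole 32-bit lanes at a time and uses trn1 at halfword and then byte granularity to interleave the four widened partials back into a single byte vector per store.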
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
index f3f4950a1f..f07acd8734 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,11 +99,11 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"cntb x26\n"
"cntb x25, ALL, MUL #2\n"
"cntb x24, ALL, MUL #3\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
- "whilelt p2.b, x25, %x[n_channels]\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
+ "whilelt p2.b, x26, %x[n_channels]\n"
+ "whilelt p1.b, x25, %x[n_channels]\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x23, %x[n_valid_cells], #0x1\n"
@@ -128,14 +128,14 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
@@ -145,24 +145,24 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
@@ -204,17 +204,17 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- "ld1b { z17.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z19.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x24]\n"
+ ".inst 0x4508aa77 // ushllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508ae76 // ushllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508aa55 // ushllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508ae54 // ushllt z20.h, z18.b, #0x0\n"
".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
@@ -235,98 +235,98 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
- ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
- ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
- ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
- ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
- ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
- ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
- ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
- ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
- ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
- ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
- ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
- ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
- ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
- ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
- ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
- ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
- ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
- ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
- ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
- ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
- ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
- ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "ld1rw { z18.s }, p4/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z17.s }, p4/Z, [%x[shift_ptr]]\n"
"mov z16.s, #0x0\n"
- "mov z18.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smax z11.s, p0/M, z11.s, z16.s\n"
- "smax z10.s, p0/M, z10.s, z16.s\n"
- "smax z9.s, p0/M, z9.s, z16.s\n"
- "smax z8.s, p0/M, z8.s, z16.s\n"
- "smax z7.s, p0/M, z7.s, z16.s\n"
- "smax z6.s, p0/M, z6.s, z16.s\n"
- "smax z5.s, p0/M, z5.s, z16.s\n"
- "smax z4.s, p0/M, z4.s, z16.s\n"
- "smax z3.s, p0/M, z3.s, z16.s\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
- "trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
- "trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z11.s, p0/M, z11.s, z18.s\n"
- "smin z10.s, p0/M, z10.s, z18.s\n"
- "trn1 z17.h, z11.h, z10.h\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "smin z9.s, p0/M, z9.s, z18.s\n"
- "smin z8.s, p0/M, z8.s, z18.s\n"
+ "mov z20.s, #0xff\n"
+ ".inst 0x04b275ef // sqdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x04b2754a // sqdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x04b27508 // sqdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
+ ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
+ ".inst 0x04b274e7 // sqdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x04b274c6 // sqdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
+ ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ ".inst 0x04b274a5 // sqdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x04b27484 // sqdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
+ ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
+ ".inst 0x04b27463 // sqdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x04b27442 // sqdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
+ ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
+ ".inst 0x04b27421 // sqdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27400 // sqdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
+ ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
+ ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
+ ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
+ ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
+ ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
+ ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
+ ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
+ "smax z15.s, p4/M, z15.s, z16.s\n"
+ "smax z14.s, p4/M, z14.s, z16.s\n"
+ "smax z13.s, p4/M, z13.s, z16.s\n"
+ "smax z12.s, p4/M, z12.s, z16.s\n"
+ "smax z11.s, p4/M, z11.s, z16.s\n"
+ "smax z10.s, p4/M, z10.s, z16.s\n"
+ "smax z9.s, p4/M, z9.s, z16.s\n"
+ "smax z8.s, p4/M, z8.s, z16.s\n"
+ "smax z7.s, p4/M, z7.s, z16.s\n"
+ "smax z6.s, p4/M, z6.s, z16.s\n"
+ "smax z5.s, p4/M, z5.s, z16.s\n"
+ "smax z4.s, p4/M, z4.s, z16.s\n"
+ "smax z3.s, p4/M, z3.s, z16.s\n"
+ "smax z2.s, p4/M, z2.s, z16.s\n"
+ "smax z1.s, p4/M, z1.s, z16.s\n"
+ "smax z0.s, p4/M, z0.s, z16.s\n"
+ "smin z15.s, p4/M, z15.s, z20.s\n"
+ "smin z14.s, p4/M, z14.s, z20.s\n"
+ "smin z13.s, p4/M, z13.s, z20.s\n"
+ "smin z12.s, p4/M, z12.s, z20.s\n"
+ "smin z11.s, p4/M, z11.s, z20.s\n"
+ "smin z10.s, p4/M, z10.s, z20.s\n"
+ "smin z9.s, p4/M, z9.s, z20.s\n"
+ "smin z8.s, p4/M, z8.s, z20.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z7.s, p4/M, z7.s, z20.s\n"
+ "smin z6.s, p4/M, z6.s, z20.s\n"
+ "trn1 z17.h, z13.h, z12.h\n"
+ "smin z5.s, p4/M, z5.s, z20.s\n"
+ "smin z4.s, p4/M, z4.s, z20.s\n"
+ "trn1 z18.h, z11.h, z10.h\n"
+ "smin z3.s, p4/M, z3.s, z20.s\n"
+ "smin z2.s, p4/M, z2.s, z20.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z7.s, p0/M, z7.s, z18.s\n"
- "smin z6.s, p0/M, z6.s, z18.s\n"
- "trn1 z17.h, z7.h, z6.h\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z5.s, p0/M, z5.s, z18.s\n"
- "smin z4.s, p0/M, z4.s, z18.s\n"
- "trn1 z16.h, z5.h, z4.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z3.s, p0/M, z3.s, z18.s\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
- "trn1 z17.h, z3.h, z2.h\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z1.s, p4/M, z1.s, z20.s\n"
+ "smin z0.s, p4/M, z0.s, z20.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "trn1 z20.b, z19.b, z17.b\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z18.b, z18.b, z16.b\n"
"trn1 z16.h, z1.h, z0.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "incb x24, ALL, MUL #4\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
+ "st1b { z20.b }, p3, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p2, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p1, [%x[outptr], x25]\n"
"incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p0, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x23, %x[n_valid_cells], #0x1\n"
@@ -339,21 +339,21 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- "add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
@@ -367,42 +367,42 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p3/Z, [x20, x27]\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
- ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
- ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
- ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
- ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
- ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
- ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
- ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z19.s }, p4/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z18.s }, p4/Z, [%x[shift_ptr]]\n"
"mov z17.s, #0x0\n"
"mov z16.s, #0xff\n"
- "smax z15.s, p0/M, z15.s, z17.s\n"
- "smax z14.s, p0/M, z14.s, z17.s\n"
- "smax z13.s, p0/M, z13.s, z17.s\n"
- "smax z12.s, p0/M, z12.s, z17.s\n"
- "smin z15.s, p0/M, z15.s, z16.s\n"
- "smin z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b375ef // sqdmulh z15.s, z15.s, z19.s\n"
+ ".inst 0x04b375ce // sqdmulh z14.s, z14.s, z19.s\n"
+ ".inst 0x04b375ad // sqdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x04b3758c // sqdmulh z12.s, z12.s, z19.s\n"
+ ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
+ ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
+ ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
+ ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
+ "smax z15.s, p4/M, z15.s, z17.s\n"
+ "smax z14.s, p4/M, z14.s, z17.s\n"
+ "smax z13.s, p4/M, z13.s, z17.s\n"
+ "smax z12.s, p4/M, z12.s, z17.s\n"
+ "smin z15.s, p4/M, z15.s, z16.s\n"
+ "smin z14.s, p4/M, z14.s, z16.s\n"
+ "smin z13.s, p4/M, z13.s, z16.s\n"
+ "smin z12.s, p4/M, z12.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z16.s\n"
- "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "st1b { z16.b }, p3, [%x[outptr], x27]\n"
"incb x27\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
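
For reference while reading the hunks above: the unsigned average-pooling kernel accumulates pairs of rows with uaddlb/uaddlt and uaddwb/uaddwt, then rescales each 32-bit sum with sqdmulh against *rescale_ptr, shifts with srshl against *shift_ptr, and clamps to [0, 255] before packing. A rough scalar equivalent, assuming the usual SVE semantics of sqdmulh and srshl; the function names are illustrative:

    #include <stdint.h>

    static int32_t sqdmulh32(int32_t a, int32_t b)
    {
        /* sqdmulh: saturating doubling multiply, high half (no rounding). */
        int64_t r = ((int64_t)a * b) >> 31;
        if (r > INT32_MAX) r = INT32_MAX;    /* only hit for a == b == INT32_MIN */
        return (int32_t)r;
    }

    static int32_t srshl32(int32_t x, int32_t shift)
    {
        if (shift >= 0)
            return (int32_t)((int64_t)x << shift);
        return (int32_t)(((int64_t)x + ((int64_t)1 << (-shift - 1))) >> -shift);
    }

    static uint8_t average_u8(const uint8_t *vals, int n,
                              int32_t rescale, int32_t shift)
    {
        int32_t acc = 0;
        for (int i = 0; i < n; i++)          /* uaddlb/uaddlt + uaddwb/uaddwt */
            acc += vals[i];
        acc = sqdmulh32(acc, rescale);       /* sqdmulh by *rescale_ptr       */
        acc = srshl32(acc, shift);           /* srshl by *shift_ptr           */
        if (acc < 0)   acc = 0;              /* smax with 0                   */
        if (acc > 255) acc = 255;            /* smin with 0xff                */
        return (uint8_t)acc;
    }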
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 8612555bfb..74dfac4133 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,22 +66,22 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"ldr x15, [%x[args], %[offsetof_n_channels]]\n"
"ldr x21, [%x[args], %[offsetof_outptrs]]\n"
"mov x14, #0x0\n"
- "whilelt p0.b, x14, x15\n"
- "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
- "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "mov x11, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x21, #0x0]\n"
"ldp x10, x9, [x21, #0x10]\n"
+ "whilelt p0.b, x14, x15\n"
"ldp x28, x27, [x20, #0x0]\n"
"ldp x26, x25, [x20, #0x10]\n"
"ldp x24, x23, [x20, #0x20]\n"
"ldp x22, x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x40]\n"
"ld1b { z31.b }, p0/Z, [x27, x14]\n"
- "ld1b { z30.b }, p0/Z, [x24, x14]\n"
- "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
"ld1b { z28.b }, p0/Z, [x25, x14]\n"
- "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x14]\n"
"ld1b { z26.b }, p0/Z, [x26, x14]\n"
"ld1b { z25.b }, p0/Z, [x23, x14]\n"
"ld1b { z24.b }, p0/Z, [x22, x14]\n"
@@ -90,50 +90,50 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
"whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
+ "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z29.b\n"
+ "movprfx z21, z29\n umax z21.b, p2/M, z21.b, z27.b\n"
"ld1b { z31.b }, p1/Z, [x27, x14]\n"
- "ld1b { z30.b }, p1/Z, [x24, x14]\n"
- "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
- "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
- "ld1b { z29.b }, p1/Z, [x21, x14]\n"
- "ld1b { z27.b }, p1/Z, [x28, x14]\n"
- "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
- "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z29.b }, p1/Z, [x24, x14]\n"
+ "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z30.b\n"
+ "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x28, x14]\n"
+ "movprfx z16, z28\n umax z16.b, p2/M, z16.b, z24.b\n"
+ "movprfx z20, z25\n umax z20.b, p2/M, z20.b, z23.b\n"
"ld1b { z28.b }, p1/Z, [x25, x14]\n"
"ld1b { z26.b }, p1/Z, [x26, x14]\n"
"ld1b { z25.b }, p1/Z, [x23, x14]\n"
"ld1b { z24.b }, p1/Z, [x22, x14]\n"
- "whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "whilelt p0.b, x13, x15\n"
"ld1b { z23.b }, p1/Z, [x20, x14]\n"
"incw x14\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
"whilelt p1.b, x14, x15\n"
- "st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
- "umax z17.b, p2/M, z17.b, z21.b\n"
- "st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
- "st1b { z17.b }, p0, [x10, x11]\n"
- "st1b { z16.b }, p0, [x9, x11]\n"
- "incw x11\n"
+ "st1b { z19.b }, p0, [x12, x13]\n"
+ "st1b { z18.b }, p0, [x11, x13]\n"
+ "st1b { z17.b }, p0, [x10, x13]\n"
+ "st1b { z16.b }, p0, [x9, x13]\n"
+ "incw x13\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
- "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
- "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
- "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
- "whilelt p0.b, x11, x15\n"
- "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z16.b }, p0, [x13, x11]\n"
- "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
- "umax z17.b, p2/M, z17.b, z21.b\n"
- "st1b { z16.b }, p0, [x12, x11]\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
- "st1b { z17.b }, p0, [x10, x11]\n"
- "st1b { z16.b }, p0, [x9, x11]\n"
+ "movprfx z22, z31\n umax z22.b, p2/M, z22.b, z29.b\n"
+ "movprfx z21, z29\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z30.b\n"
+ "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
+ "movprfx z16, z28\n umax z16.b, p2/M, z16.b, z24.b\n"
+ "movprfx z20, z25\n umax z20.b, p2/M, z20.b, z23.b\n"
+ "whilelt p0.b, x13, x15\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z19.b }, p0, [x12, x13]\n"
+ "st1b { z18.b }, p0, [x11, x13]\n"
+ "st1b { z17.b }, p0, [x10, x13]\n"
+ "st1b { z16.b }, p0, [x9, x13]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
index be0eb398ae..340a35a5f8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,176 +44,176 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
"cntb x28\n"
"cntb x27, ALL, MUL #2\n"
"cntb x26, ALL, MUL #3\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
- "whilelt p2.b, x27, %x[n_channels]\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
+ "whilelt p2.b, x28, %x[n_channels]\n"
+ "whilelt p1.b, x27, %x[n_channels]\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x0\n"
- "mov z7.b, #0x0\n"
- "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z19, z2\n umax z19.b, p4/M, z19.b, z1.b\n"
+ "umax z23.b, p4/M, z23.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
- "umax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
- "umax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
- "umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "umax z17.b, p0/M, z17.b, z21.b\n"
- "umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
+ "umax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
+ "umax z21.b, p4/M, z21.b, z26.b\n"
+ "umax z16.b, p4/M, z16.b, z25.b\n"
+ "umax z20.b, p4/M, z20.b, z24.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "umax z19.b, p4/M, z19.b, z23.b\n"
+ "umax z18.b, p4/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "umax z17.b, p4/M, z17.b, z21.b\n"
"subs x25, x25, #0x1\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "umax z7.b, p0/M, z7.b, z18.b\n"
- "umax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "umax z16.b, p4/M, z16.b, z20.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "umax z6.b, p4/M, z6.b, z19.b\n"
+ "umax z5.b, p4/M, z5.b, z18.b\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "umax z4.b, p4/M, z4.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "umax z3.b, p4/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
- "umax z22.b, p0/M, z22.b, z30.b\n"
- "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
- "umax z21.b, p0/M, z21.b, z27.b\n"
- "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
- "umax z20.b, p0/M, z20.b, z24.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z18.b, p0/M, z18.b, z22.b\n"
- "umax z17.b, p0/M, z17.b, z21.b\n"
- "umax z16.b, p0/M, z16.b, z20.b\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "umax z7.b, p0/M, z7.b, z18.b\n"
- "umax z6.b, p0/M, z6.b, z17.b\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
+ "movprfx z19, z2\n umax z19.b, p4/M, z19.b, z1.b\n"
+ "umax z23.b, p4/M, z23.b, z0.b\n"
+ "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
+ "umax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
+ "umax z21.b, p4/M, z21.b, z26.b\n"
+ "umax z16.b, p4/M, z16.b, z25.b\n"
+ "umax z20.b, p4/M, z20.b, z24.b\n"
+ "umax z19.b, p4/M, z19.b, z23.b\n"
+ "umax z18.b, p4/M, z18.b, z22.b\n"
+ "umax z17.b, p4/M, z17.b, z21.b\n"
+ "umax z16.b, p4/M, z16.b, z20.b\n"
+ "umax z6.b, p4/M, z6.b, z19.b\n"
+ "umax z5.b, p4/M, z5.b, z18.b\n"
+ "umax z4.b, p4/M, z4.b, z17.b\n"
+ "umax z3.b, p4/M, z3.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
- "ld1b { z17.b }, p3/Z, [x20, x28]\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "umax z7.b, p0/M, z7.b, z17.b\n"
- "umax z6.b, p0/M, z6.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z19.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x26]\n"
+ "umax z6.b, p4/M, z6.b, z19.b\n"
+ "umax z5.b, p4/M, z5.b, z18.b\n"
+ "umax z4.b, p4/M, z4.b, z17.b\n"
+ "umax z3.b, p4/M, z3.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z6.b }, p3, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
+ "st1b { z5.b }, p2, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
- "st1b { z6.b }, p2, [%x[outptr], x27]\n"
+ "st1b { z4.b }, p1, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
- "st1b { z5.b }, p1, [%x[outptr], x26]\n"
+ "st1b { z3.b }, p0, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x0\n"
+ "mov z6.b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "movprfx z16, z2\n umax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n umax z17.b, p4/M, z17.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "umax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "umax z16.b, p4/M, z16.b, z17.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "umax z6.b, p4/M, z6.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
- "umax z16.b, p0/M, z16.b, z17.b\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
+ "movprfx z16, z2\n umax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n umax z17.b, p4/M, z17.b, z0.b\n"
+ "umax z16.b, p4/M, z16.b, z17.b\n"
+ "umax z6.b, p4/M, z6.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x9]\n"
+ "umax z6.b, p4/M, z6.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z6.b }, p3, [%x[outptr], x9]\n"
"incb x9\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
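
The generic max kernel above keeps a running maximum per channel vector, consuming four input cells per iteration and finishing with a single-input tail; the running value starts at 0 for unsigned data (the signed variant earlier in this patch starts at 0x80, i.e. -128). A scalar sketch of that accumulation, with illustrative names:

    #include <stdint.h>

    static uint8_t max_pool_generic_u8(const uint8_t *const *inptrs,
                                       unsigned n_valid_cells, unsigned channel)
    {
        uint8_t acc = 0;                          /* mov z6.b, #0x0          */
        unsigned i = 0;
        for (; i + 4 <= n_valid_cells; i += 4) {  /* "4 inputs loop"         */
            uint8_t a = max_u8_pair(inptrs[i][channel],     inptrs[i + 1][channel]);
            uint8_t b = max_u8_pair(inptrs[i + 2][channel], inptrs[i + 3][channel]);
            uint8_t m = a > b ? a : b;
            acc = acc > m ? acc : m;
        }
        for (; i < n_valid_cells; i++) {          /* "Single input loop"     */
            uint8_t v = inptrs[i][channel];
            acc = acc > v ? acc : v;
        }
        return acc;
    }

    /* Small helper used above. */
    static uint8_t max_u8_pair(uint8_t a, uint8_t b) { return a > b ? a : b; }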
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index e8339a2cd9..db90c8a3a2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -123,20 +123,20 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"cntb x26\n"
"cntb x25, ALL, MUL #2\n"
"cntb x24, ALL, MUL #3\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
- "whilelt p2.b, x25, %x[n_channels]\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
+ "whilelt p2.b, x26, %x[n_channels]\n"
+ "whilelt p1.b, x25, %x[n_channels]\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
- "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -152,14 +152,14 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
@@ -169,24 +169,24 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
- "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z28.b }, p3/Z, [x20, x26]\n"
- "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z26.b }, p2/Z, [x20, x25]\n"
- "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z29.b }, p2/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p2/Z, [x20, x26]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
- "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z27.b }, p1/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x25]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z25.b }, p0/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x24]\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
@@ -228,17 +228,17 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
- ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
- ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p3/Z, [x20, x26]\n"
- "ld1b { z17.b }, p2/Z, [x20, x25]\n"
- ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
- ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
- "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z19.b }, p3/Z, [x20, x27]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x25]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x24]\n"
+ ".inst 0x4508aa77 // ushllb z23.h, z19.b, #0x0\n"
+ ".inst 0x4508ae76 // ushllt z22.h, z19.b, #0x0\n"
+ ".inst 0x4508aa55 // ushllb z21.h, z18.b, #0x0\n"
+ ".inst 0x4508ae54 // ushllt z20.h, z18.b, #0x0\n"
".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
@@ -259,160 +259,160 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
- "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
- ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
- ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
- ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
- "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
- ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
- ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
- ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
- ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
- ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
- ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
- ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
- ".inst 0x04b0756b // sqrdmulh z11.s, z11.s, z16.s\n"
- ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
- ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
- ".inst 0x04b0754a // sqrdmulh z10.s, z10.s, z16.s\n"
- ".inst 0x04b07529 // sqrdmulh z9.s, z9.s, z16.s\n"
- ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
- ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
- ".inst 0x04b07508 // sqrdmulh z8.s, z8.s, z16.s\n"
- ".inst 0x04b074e7 // sqrdmulh z7.s, z7.s, z16.s\n"
- ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
- ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
- ".inst 0x04b074c6 // sqrdmulh z6.s, z6.s, z16.s\n"
- ".inst 0x04b074a5 // sqrdmulh z5.s, z5.s, z16.s\n"
- ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
- ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
- ".inst 0x04b07484 // sqrdmulh z4.s, z4.s, z16.s\n"
- ".inst 0x04b07463 // sqrdmulh z3.s, z3.s, z16.s\n"
- ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
- ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "ld1rw { z21.s }, p4/Z, [%x[left_shift]]\n"
+ "ld1rw { z19.s }, p4/Z, [%x[combined_rescale_value]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
- ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
- ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
- ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
- ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
+ "mov z18.s, #0x0\n"
+ "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
+ "ld1rw { z16.s }, p4/Z, [x20]\n"
+ "mov z20.s, #0xff\n"
+ ".inst 0x448292af // srshl z15.s, p4/M, z15.s, z21.s\n"
+ ".inst 0x448292ae // srshl z14.s, p4/M, z14.s, z21.s\n"
+ ".inst 0x448292ad // srshl z13.s, p4/M, z13.s, z21.s\n"
+ ".inst 0x448292ac // srshl z12.s, p4/M, z12.s, z21.s\n"
+ ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n"
+ ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n"
+ ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n"
+ ".inst 0x04b375ef // sqrdmulh z15.s, z15.s, z19.s\n"
+ ".inst 0x448292a8 // srshl z8.s, p4/M, z8.s, z21.s\n"
+ ".inst 0x448292a7 // srshl z7.s, p4/M, z7.s, z21.s\n"
+ ".inst 0x04b375ce // sqrdmulh z14.s, z14.s, z19.s\n"
+ ".inst 0x04b375ad // sqrdmulh z13.s, z13.s, z19.s\n"
+ ".inst 0x448292a6 // srshl z6.s, p4/M, z6.s, z21.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x04b3758c // sqrdmulh z12.s, z12.s, z19.s\n"
+ ".inst 0x04b3756b // sqrdmulh z11.s, z11.s, z19.s\n"
+ ".inst 0x448292a4 // srshl z4.s, p4/M, z4.s, z21.s\n"
+ ".inst 0x448292a3 // srshl z3.s, p4/M, z3.s, z21.s\n"
+ ".inst 0x04b3754a // sqrdmulh z10.s, z10.s, z19.s\n"
+ ".inst 0x04b37529 // sqrdmulh z9.s, z9.s, z19.s\n"
+ ".inst 0x448292a2 // srshl z2.s, p4/M, z2.s, z21.s\n"
+ ".inst 0x448292a1 // srshl z1.s, p4/M, z1.s, z21.s\n"
+ ".inst 0x04b37508 // sqrdmulh z8.s, z8.s, z19.s\n"
+ ".inst 0x04b374e7 // sqrdmulh z7.s, z7.s, z19.s\n"
+ ".inst 0x448292a0 // srshl z0.s, p4/M, z0.s, z21.s\n"
+ ".inst 0x04b374c6 // sqrdmulh z6.s, z6.s, z19.s\n"
+ ".inst 0x04b374a5 // sqrdmulh z5.s, z5.s, z19.s\n"
+ ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
+ ".inst 0x04b37484 // sqrdmulh z4.s, z4.s, z19.s\n"
+ ".inst 0x04b37463 // sqrdmulh z3.s, z3.s, z19.s\n"
+ ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
+ ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
+ ".inst 0x04b37442 // sqrdmulh z2.s, z2.s, z19.s\n"
+ ".inst 0x04b37421 // sqrdmulh z1.s, z1.s, z19.s\n"
+ ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
+ ".inst 0x04b37400 // sqrdmulh z0.s, z0.s, z19.s\n"
+ ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
+ ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
+ ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
+ ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
"add z14.s, z14.s, z16.s\n"
- ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
- ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
"add z13.s, z13.s, z16.s\n"
+ ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
+ ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
"add z12.s, z12.s, z16.s\n"
- ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
- ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
+ ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
+ ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
"add z10.s, z10.s, z16.s\n"
- ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
- ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
"add z9.s, z9.s, z16.s\n"
+ ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
+ ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
"add z8.s, z8.s, z16.s\n"
- ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
- ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
"add z7.s, z7.s, z16.s\n"
+ ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
"add z6.s, z6.s, z16.s\n"
- ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
- ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
"add z5.s, z5.s, z16.s\n"
+ "smax z15.s, p4/M, z15.s, z18.s\n"
"add z4.s, z4.s, z16.s\n"
- ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
"add z3.s, z3.s, z16.s\n"
+ "smax z14.s, p4/M, z14.s, z18.s\n"
+ "smax z13.s, p4/M, z13.s, z18.s\n"
"add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
+ "smax z12.s, p4/M, z12.s, z18.s\n"
+ "smax z11.s, p4/M, z11.s, z18.s\n"
"add z0.s, z0.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "smax z15.s, p0/M, z15.s, z16.s\n"
- "smax z14.s, p0/M, z14.s, z16.s\n"
- "mov z18.s, #0xff\n"
- "smax z13.s, p0/M, z13.s, z16.s\n"
- "smax z12.s, p0/M, z12.s, z16.s\n"
- "smax z11.s, p0/M, z11.s, z16.s\n"
- "smax z10.s, p0/M, z10.s, z16.s\n"
- "smax z9.s, p0/M, z9.s, z16.s\n"
- "smax z8.s, p0/M, z8.s, z16.s\n"
- "smax z7.s, p0/M, z7.s, z16.s\n"
- "smax z6.s, p0/M, z6.s, z16.s\n"
- "smax z5.s, p0/M, z5.s, z16.s\n"
- "smax z4.s, p0/M, z4.s, z16.s\n"
- "smax z3.s, p0/M, z3.s, z16.s\n"
- "smax z2.s, p0/M, z2.s, z16.s\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smin z15.s, p0/M, z15.s, z18.s\n"
- "smin z14.s, p0/M, z14.s, z18.s\n"
- "trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z18.s\n"
- "smin z12.s, p0/M, z12.s, z18.s\n"
- "trn1 z16.h, z13.h, z12.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z11.s, p0/M, z11.s, z18.s\n"
- "smin z10.s, p0/M, z10.s, z18.s\n"
- "trn1 z17.h, z11.h, z10.h\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
- "smin z9.s, p0/M, z9.s, z18.s\n"
- "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smax z10.s, p4/M, z10.s, z18.s\n"
+ "smax z9.s, p4/M, z9.s, z18.s\n"
+ "smax z8.s, p4/M, z8.s, z18.s\n"
+ "smax z7.s, p4/M, z7.s, z18.s\n"
+ "smax z6.s, p4/M, z6.s, z18.s\n"
+ "smax z5.s, p4/M, z5.s, z18.s\n"
+ "smax z4.s, p4/M, z4.s, z18.s\n"
+ "smax z3.s, p4/M, z3.s, z18.s\n"
+ "smax z2.s, p4/M, z2.s, z18.s\n"
+ "smax z1.s, p4/M, z1.s, z18.s\n"
+ "smax z0.s, p4/M, z0.s, z18.s\n"
+ "smin z15.s, p4/M, z15.s, z20.s\n"
+ "smin z14.s, p4/M, z14.s, z20.s\n"
+ "smin z13.s, p4/M, z13.s, z20.s\n"
+ "smin z12.s, p4/M, z12.s, z20.s\n"
+ "smin z11.s, p4/M, z11.s, z20.s\n"
+ "smin z10.s, p4/M, z10.s, z20.s\n"
+ "smin z9.s, p4/M, z9.s, z20.s\n"
+ "smin z8.s, p4/M, z8.s, z20.s\n"
+ "smin z7.s, p4/M, z7.s, z20.s\n"
+ "trn1 z19.h, z15.h, z14.h\n"
+ "smin z6.s, p4/M, z6.s, z20.s\n"
+ "smin z5.s, p4/M, z5.s, z20.s\n"
+ "trn1 z17.h, z13.h, z12.h\n"
+ "smin z4.s, p4/M, z4.s, z20.s\n"
+ "smin z3.s, p4/M, z3.s, z20.s\n"
+ "trn1 z18.h, z11.h, z10.h\n"
+ "smin z2.s, p4/M, z2.s, z20.s\n"
+ "smin z1.s, p4/M, z1.s, z20.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z7.s, p0/M, z7.s, z18.s\n"
- "smin z6.s, p0/M, z6.s, z18.s\n"
- "trn1 z17.h, z7.h, z6.h\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z5.s, p0/M, z5.s, z18.s\n"
- "smin z4.s, p0/M, z4.s, z18.s\n"
- "trn1 z16.h, z5.h, z4.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z3.s, p0/M, z3.s, z18.s\n"
- "smin z2.s, p0/M, z2.s, z18.s\n"
- "trn1 z17.h, z3.h, z2.h\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z0.s, p4/M, z0.s, z20.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "trn1 z20.b, z19.b, z17.b\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "trn1 z19.h, z3.h, z2.h\n"
+ "trn1 z18.b, z18.b, z16.b\n"
"trn1 z16.h, z1.h, z0.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "incb x24, ALL, MUL #4\n"
- "whilelt p1.b, x24, %x[n_channels]\n"
+ "st1b { z20.b }, p3, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p2, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p1, [%x[outptr], x25]\n"
"incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p0, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p0.b, x24, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
"lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z14.d, z15.d\n"
"mov z13.d, z15.d\n"
"mov z12.d, z15.d\n"
- "mov x22, %x[inptrs]\n"
"cbz x23, 11f\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
"add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
"ldp x21, x20, [x22, #0x0]\n"
"subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- "add x22, x22, #0x10\n"
- "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z31.b }, p3/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
@@ -426,53 +426,53 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x22], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p3/Z, [x20, x27]\n"
".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
- "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "ld1rw { z17.s }, p0/Z, [%x[left_shift]]\n"
- "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
- ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
- ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
- ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
- ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
- "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
- ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
- ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
- ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ "ld1rw { z21.s }, p4/Z, [%x[left_shift]]\n"
+ "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
- ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
- ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
- ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
- "add z15.s, z15.s, z16.s\n"
- "add z14.s, z14.s, z16.s\n"
- "add z13.s, z13.s, z16.s\n"
- "add z12.s, z12.s, z16.s\n"
- "mov z17.s, #0x0\n"
- "smax z15.s, p0/M, z15.s, z17.s\n"
- "smax z14.s, p0/M, z14.s, z17.s\n"
+ "mov z19.s, #0x0\n"
+ "ld1rw { z18.s }, p4/Z, [%x[right_shift]]\n"
+ "ld1rw { z17.s }, p4/Z, [x20]\n"
"mov z16.s, #0xff\n"
- "smax z13.s, p0/M, z13.s, z17.s\n"
- "smax z12.s, p0/M, z12.s, z17.s\n"
- "smin z15.s, p0/M, z15.s, z16.s\n"
- "smin z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x448292af // srshl z15.s, p4/M, z15.s, z21.s\n"
+ ".inst 0x448292ae // srshl z14.s, p4/M, z14.s, z21.s\n"
+ ".inst 0x448292ad // srshl z13.s, p4/M, z13.s, z21.s\n"
+ ".inst 0x448292ac // srshl z12.s, p4/M, z12.s, z21.s\n"
+ ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
+ ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
+ ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
+ ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
+ ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
+ ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
+ ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
+ ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
+ "add z15.s, z15.s, z17.s\n"
+ "add z14.s, z14.s, z17.s\n"
+ "add z13.s, z13.s, z17.s\n"
+ "add z12.s, z12.s, z17.s\n"
+ "smax z15.s, p4/M, z15.s, z19.s\n"
+ "smax z14.s, p4/M, z14.s, z19.s\n"
+ "smax z13.s, p4/M, z13.s, z19.s\n"
+ "smax z12.s, p4/M, z12.s, z19.s\n"
+ "smin z15.s, p4/M, z15.s, z16.s\n"
+ "smin z14.s, p4/M, z14.s, z16.s\n"
+ "smin z13.s, p4/M, z13.s, z16.s\n"
+ "smin z12.s, p4/M, z12.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smin z13.s, p0/M, z13.s, z16.s\n"
- "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "st1b { z16.b }, p3, [%x[outptr], x27]\n"
"incb x27\n"
- "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
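// Editor's note: the vector code above reduces to a simple per-lane fixed-point pipeline:
// rounding-shift the 32-bit accumulator left, scale it with a saturating rounding doubling
// multiply-high, rounding-shift right, add the output offset, then clamp to [0, 255] before
// the trn1 byte packing. A minimal scalar sketch of that sequence follows; the helper names
// and C++ form are illustrative only, not part of the kernel.

#include <algorithm>
#include <cstdint>

// Rounding shift by a signed amount: positive shifts left, negative shifts right with
// round-to-nearest, mirroring SRSHL on a single lane.
static int32_t rounding_shift(int32_t v, int32_t shift)
{
  if (shift >= 0) return v << shift;
  const int64_t round = int64_t{1} << (-shift - 1);
  return static_cast<int32_t>((int64_t{v} + round) >> -shift);
}

// Saturating rounding doubling multiply returning the high half, mirroring SQRDMULH on a
// single lane.
static int32_t sat_rounding_doubling_mul_high(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;  // the one case that would overflow below
  const int64_t p = (int64_t{a} * b * 2 + (int64_t{1} << 30)) >> 31;
  return static_cast<int32_t>(std::min<int64_t>(std::max<int64_t>(p, INT32_MIN), INT32_MAX));
}

// One output lane of the averaging kernel's requantisation (scalar reference only).
static uint8_t requantize_avg_lane(int32_t acc, int32_t left_shift, int32_t combined_rescale,
                                   int32_t right_shift, int32_t output_offset)
{
  int32_t v = rounding_shift(acc, left_shift);                 // srshl by *left_shift
  v = sat_rounding_doubling_mul_high(v, combined_rescale);     // sqrdmulh by *combined_rescale_value
  v = rounding_shift(v, right_shift);                          // srshl by *right_shift (negative)
  v += output_offset;                                          // add qp.output_offset
  return static_cast<uint8_t>(std::min(std::max(v, 0), 255));  // smax #0x0 / smin #0xff
}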
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
index 94522cdaaa..8308a115a4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,367 +46,367 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
"cntb x28\n"
"cntb x27, ALL, MUL #2\n"
"cntb x26, ALL, MUL #3\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
- "whilelt p2.b, x27, %x[n_channels]\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
- "ptrue p0.b\n"
+ "ptrue p4.b\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
+ "whilelt p2.b, x28, %x[n_channels]\n"
+ "whilelt p1.b, x27, %x[n_channels]\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x0\n"
- "mov z7.b, #0x0\n"
- "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
"cbz x25, 4f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z19, z2\n umax z19.b, p4/M, z19.b, z1.b\n"
+ "umax z23.b, p4/M, z23.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
- "umax z22.b, p0/M, z22.b, z30.b\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
- "umax z21.b, p0/M, z21.b, z27.b\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
- "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
- "umax z20.b, p0/M, z20.b, z24.b\n"
- "ld1b { z0.b }, p3/Z, [x23, x28]\n"
- "ld1b { z31.b }, p3/Z, [x22, x28]\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z18.b, p0/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p3/Z, [x21, x28]\n"
- "ld1b { z30.b }, p3/Z, [x20, x28]\n"
- "umax z17.b, p0/M, z17.b, z21.b\n"
- "umax z16.b, p0/M, z16.b, z20.b\n"
- "ld1b { z29.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
+ "umax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
+ "umax z21.b, p4/M, z21.b, z26.b\n"
+ "umax z16.b, p4/M, z16.b, z25.b\n"
+ "umax z20.b, p4/M, z20.b, z24.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "umax z19.b, p4/M, z19.b, z23.b\n"
+ "umax z18.b, p4/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "umax z17.b, p4/M, z17.b, z21.b\n"
"subs x25, x25, #0x1\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "ld1b { z21.b }, p2/Z, [x21, x27]\n"
- "ld1b { z27.b }, p2/Z, [x20, x27]\n"
- "umax z7.b, p0/M, z7.b, z18.b\n"
- "umax z6.b, p0/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x23, x26]\n"
- "ld1b { z25.b }, p1/Z, [x22, x26]\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z31.b }, p2/Z, [x23, x28]\n"
+ "ld1b { z30.b }, p2/Z, [x22, x28]\n"
+ "umax z16.b, p4/M, z16.b, z20.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z20.b }, p1/Z, [x21, x26]\n"
- "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "ld1b { z22.b }, p2/Z, [x21, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x20, x28]\n"
+ "umax z6.b, p4/M, z6.b, z19.b\n"
+ "umax z5.b, p4/M, z5.b, z18.b\n"
+ "ld1b { z28.b }, p1/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p1/Z, [x22, x27]\n"
+ "umax z4.b, p4/M, z4.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x20, x27]\n"
+ "umax z3.b, p4/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p0/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p0/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
- "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
- "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
- "umax z22.b, p0/M, z22.b, z30.b\n"
- "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
- "umax z21.b, p0/M, z21.b, z27.b\n"
- "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
- "umax z20.b, p0/M, z20.b, z24.b\n"
- "umax z19.b, p0/M, z19.b, z23.b\n"
- "umax z18.b, p0/M, z18.b, z22.b\n"
- "umax z17.b, p0/M, z17.b, z21.b\n"
- "umax z16.b, p0/M, z16.b, z20.b\n"
- "umax z8.b, p0/M, z8.b, z19.b\n"
- "umax z7.b, p0/M, z7.b, z18.b\n"
- "umax z6.b, p0/M, z6.b, z17.b\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
+ "movprfx z19, z2\n umax z19.b, p4/M, z19.b, z1.b\n"
+ "umax z23.b, p4/M, z23.b, z0.b\n"
+ "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
+ "umax z22.b, p4/M, z22.b, z29.b\n"
+ "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
+ "umax z21.b, p4/M, z21.b, z26.b\n"
+ "umax z16.b, p4/M, z16.b, z25.b\n"
+ "umax z20.b, p4/M, z20.b, z24.b\n"
+ "umax z19.b, p4/M, z19.b, z23.b\n"
+ "umax z18.b, p4/M, z18.b, z22.b\n"
+ "umax z17.b, p4/M, z17.b, z21.b\n"
+ "umax z16.b, p4/M, z16.b, z20.b\n"
+ "umax z6.b, p4/M, z6.b, z19.b\n"
+ "umax z5.b, p4/M, z5.b, z18.b\n"
+ "umax z4.b, p4/M, z4.b, z17.b\n"
+ "umax z3.b, p4/M, z3.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
- "ld1b { z17.b }, p3/Z, [x20, x28]\n"
- "ld1b { z16.b }, p2/Z, [x20, x27]\n"
- "umax z7.b, p0/M, z7.b, z17.b\n"
- "umax z6.b, p0/M, z6.b, z16.b\n"
- "ld1b { z16.b }, p1/Z, [x20, x26]\n"
- "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z19.b }, p3/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p2/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p1/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p0/Z, [x20, x26]\n"
+ "umax z6.b, p4/M, z6.b, z19.b\n"
+ "umax z5.b, p4/M, z5.b, z18.b\n"
+ "umax z4.b, p4/M, z4.b, z17.b\n"
+ "umax z3.b, p4/M, z3.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z3.s }, p0/Z, [x20]\n"
- ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
- ".inst 0x4508ad18 // ushllt z24.h, z8.b, #0x0\n"
- ".inst 0x4508a8f7 // ushllb z23.h, z7.b, #0x0\n"
- ".inst 0x4508acf6 // ushllt z22.h, z7.b, #0x0\n"
- "neg z3.s, p0/M, z3.s\n"
+ "add x21, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ ".inst 0x4508a8d3 // ushllb z19.h, z6.b, #0x0\n"
+ ".inst 0x4508acd1 // ushllt z17.h, z6.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a8d5 // ushllb z21.h, z6.b, #0x0\n"
- ".inst 0x4508acd4 // ushllt z20.h, z6.b, #0x0\n"
- "ld1rw { z2.s }, p0/Z, [x20]\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ "ld1rw { z6.s }, p4/Z, [x21]\n"
+ ".inst 0x4508a8b2 // ushllb z18.h, z5.b, #0x0\n"
".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
- "ld1rw { z18.s }, p0/Z, [x20]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ ".inst 0x4508a894 // ushllb z20.h, z4.b, #0x0\n"
+ ".inst 0x4508ac98 // ushllt z24.h, z4.b, #0x0\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x45914061 // saddwb z1.s, z3.s, z17.h\n"
- ".inst 0x45914471 // saddwt z17.s, z3.s, z17.h\n"
- ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
- ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
- ".inst 0x45984060 // saddwb z0.s, z3.s, z24.h\n"
- ".inst 0x4598447f // saddwt z31.s, z3.s, z24.h\n"
- ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
- ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
- ".inst 0x4597407e // saddwb z30.s, z3.s, z23.h\n"
- ".inst 0x4597447d // saddwt z29.s, z3.s, z23.h\n"
- ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
- ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
- ".inst 0x4596407c // saddwb z28.s, z3.s, z22.h\n"
- ".inst 0x4596447b // saddwt z27.s, z3.s, z22.h\n"
- ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
- ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
- ".inst 0x4595407a // saddwb z26.s, z3.s, z21.h\n"
- ".inst 0x45954479 // saddwt z25.s, z3.s, z21.h\n"
- ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
- ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
- ".inst 0x45944078 // saddwb z24.s, z3.s, z20.h\n"
- ".inst 0x45944477 // saddwt z23.s, z3.s, z20.h\n"
- ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
- ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
- ".inst 0x45934076 // saddwb z22.s, z3.s, z19.h\n"
- ".inst 0x45934475 // saddwt z21.s, z3.s, z19.h\n"
- ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
- ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
- ".inst 0x45904074 // saddwb z20.s, z3.s, z16.h\n"
- ".inst 0x45904473 // saddwt z19.s, z3.s, z16.h\n"
- ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
- ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
- ".inst 0x04b27631 // sqrdmulh z17.s, z17.s, z18.s\n"
+ ".inst 0x4508a877 // ushllb z23.h, z3.b, #0x0\n"
+ ".inst 0x4508ac76 // ushllt z22.h, z3.b, #0x0\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z3.s }, p4/Z, [x20]\n"
+ "neg z6.s, p4/M, z6.s\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
- ".inst 0x04b277ff // sqrdmulh z31.s, z31.s, z18.s\n"
- ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
- ".inst 0x44828211 // srshl z17.s, p0/M, z17.s, z16.s\n"
- ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
- ".inst 0x04b277bd // sqrdmulh z29.s, z29.s, z18.s\n"
- ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
- ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
- ".inst 0x04b2779c // sqrdmulh z28.s, z28.s, z18.s\n"
- ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
- ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
- ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
- ".inst 0x04b2775a // sqrdmulh z26.s, z26.s, z18.s\n"
- ".inst 0x04b27739 // sqrdmulh z25.s, z25.s, z18.s\n"
- ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
- ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
- ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
- ".inst 0x04b276f7 // sqrdmulh z23.s, z23.s, z18.s\n"
- ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
- ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
- ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
- ".inst 0x04b276b5 // sqrdmulh z21.s, z21.s, z18.s\n"
- ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
- ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
- ".inst 0x04b27694 // sqrdmulh z20.s, z20.s, z18.s\n"
- ".inst 0x04b27673 // sqrdmulh z19.s, z19.s, z18.s\n"
- ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
- ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
- ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
- ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- "add z1.s, z1.s, z16.s\n"
- "add z17.s, z17.s, z16.s\n"
- "add z0.s, z0.s, z16.s\n"
- "add z31.s, z31.s, z16.s\n"
- "add z30.s, z30.s, z16.s\n"
- "add z29.s, z29.s, z16.s\n"
- "add z28.s, z28.s, z16.s\n"
- "add z27.s, z27.s, z16.s\n"
- "add z26.s, z26.s, z16.s\n"
- "add z25.s, z25.s, z16.s\n"
- "add z24.s, z24.s, z16.s\n"
- "add z23.s, z23.s, z16.s\n"
- "add z22.s, z22.s, z16.s\n"
- "add z21.s, z21.s, z16.s\n"
- "add z20.s, z20.s, z16.s\n"
- "add z19.s, z19.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "smax z1.s, p0/M, z1.s, z16.s\n"
- "smax z17.s, p0/M, z17.s, z16.s\n"
- "smax z0.s, p0/M, z0.s, z16.s\n"
- "smax z31.s, p0/M, z31.s, z16.s\n"
- "mov z18.s, #0xff\n"
- "smax z30.s, p0/M, z30.s, z16.s\n"
- "smax z29.s, p0/M, z29.s, z16.s\n"
- "smax z28.s, p0/M, z28.s, z16.s\n"
- "smax z27.s, p0/M, z27.s, z16.s\n"
- "smax z26.s, p0/M, z26.s, z16.s\n"
- "smax z25.s, p0/M, z25.s, z16.s\n"
- "smax z24.s, p0/M, z24.s, z16.s\n"
- "smax z23.s, p0/M, z23.s, z16.s\n"
- "smax z22.s, p0/M, z22.s, z16.s\n"
- "smax z21.s, p0/M, z21.s, z16.s\n"
- "smax z20.s, p0/M, z20.s, z16.s\n"
- "smax z19.s, p0/M, z19.s, z16.s\n"
- "smin z1.s, p0/M, z1.s, z18.s\n"
- "smin z17.s, p0/M, z17.s, z18.s\n"
- "trn1 z17.h, z1.h, z17.h\n"
- "smin z0.s, p0/M, z0.s, z18.s\n"
- "smin z31.s, p0/M, z31.s, z18.s\n"
- "trn1 z16.h, z0.h, z31.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z30.s, p0/M, z30.s, z18.s\n"
- "smin z29.s, p0/M, z29.s, z18.s\n"
- "trn1 z17.h, z30.h, z29.h\n"
- "st1b { z16.b }, p4, [%x[outptr], x9]\n"
- "smin z28.s, p0/M, z28.s, z18.s\n"
- "smin z27.s, p0/M, z27.s, z18.s\n"
- "trn1 z16.h, z28.h, z27.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z26.s, p0/M, z26.s, z18.s\n"
- "smin z25.s, p0/M, z25.s, z18.s\n"
- "trn1 z17.h, z26.h, z25.h\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "smin z24.s, p0/M, z24.s, z18.s\n"
- "smin z23.s, p0/M, z23.s, z18.s\n"
- "trn1 z16.h, z24.h, z23.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "smin z22.s, p0/M, z22.s, z18.s\n"
- "smin z21.s, p0/M, z21.s, z18.s\n"
- "trn1 z17.h, z22.h, z21.h\n"
- "st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z20.s, p0/M, z20.s, z18.s\n"
- "smin z19.s, p0/M, z19.s, z18.s\n"
- "trn1 z16.h, z20.h, z19.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "incb x26, ALL, MUL #4\n"
- "whilelt p1.b, x26, %x[n_channels]\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0xff\n"
+ "ld1rw { z0.s }, p4/Z, [x20]\n"
+ ".inst 0x459340df // saddwb z31.s, z6.s, z19.h\n"
+ ".inst 0x459344d3 // saddwt z19.s, z6.s, z19.h\n"
+ ".inst 0x459140de // saddwb z30.s, z6.s, z17.h\n"
+ ".inst 0x459144d1 // saddwt z17.s, z6.s, z17.h\n"
+ ".inst 0x459240dd // saddwb z29.s, z6.s, z18.h\n"
+ ".inst 0x459244d2 // saddwt z18.s, z6.s, z18.h\n"
+ ".inst 0x459040dc // saddwb z28.s, z6.s, z16.h\n"
+ ".inst 0x459044d0 // saddwt z16.s, z6.s, z16.h\n"
+ ".inst 0x448290bf // srshl z31.s, p4/M, z31.s, z5.s\n"
+ ".inst 0x448290b3 // srshl z19.s, p4/M, z19.s, z5.s\n"
+ ".inst 0x459440d5 // saddwb z21.s, z6.s, z20.h\n"
+ ".inst 0x459444d4 // saddwt z20.s, z6.s, z20.h\n"
+ ".inst 0x448290be // srshl z30.s, p4/M, z30.s, z5.s\n"
+ ".inst 0x448290b1 // srshl z17.s, p4/M, z17.s, z5.s\n"
+ ".inst 0x459840db // saddwb z27.s, z6.s, z24.h\n"
+ ".inst 0x459844da // saddwt z26.s, z6.s, z24.h\n"
+ ".inst 0x448290bd // srshl z29.s, p4/M, z29.s, z5.s\n"
+ ".inst 0x448290b2 // srshl z18.s, p4/M, z18.s, z5.s\n"
+ ".inst 0x459740d9 // saddwb z25.s, z6.s, z23.h\n"
+ ".inst 0x459744d8 // saddwt z24.s, z6.s, z23.h\n"
+ ".inst 0x448290bc // srshl z28.s, p4/M, z28.s, z5.s\n"
+ ".inst 0x448290b0 // srshl z16.s, p4/M, z16.s, z5.s\n"
+ ".inst 0x459640d7 // saddwb z23.s, z6.s, z22.h\n"
+ ".inst 0x459644d6 // saddwt z22.s, z6.s, z22.h\n"
+ ".inst 0x448290b5 // srshl z21.s, p4/M, z21.s, z5.s\n"
+ ".inst 0x448290b4 // srshl z20.s, p4/M, z20.s, z5.s\n"
+ ".inst 0x448290bb // srshl z27.s, p4/M, z27.s, z5.s\n"
+ ".inst 0x448290ba // srshl z26.s, p4/M, z26.s, z5.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x448290b9 // srshl z25.s, p4/M, z25.s, z5.s\n"
+ ".inst 0x448290b8 // srshl z24.s, p4/M, z24.s, z5.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x448290b7 // srshl z23.s, p4/M, z23.s, z5.s\n"
+ ".inst 0x448290b6 // srshl z22.s, p4/M, z22.s, z5.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x4482907f // srshl z31.s, p4/M, z31.s, z3.s\n"
+ ".inst 0x44829073 // srshl z19.s, p4/M, z19.s, z3.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x4482907e // srshl z30.s, p4/M, z30.s, z3.s\n"
+ ".inst 0x44829071 // srshl z17.s, p4/M, z17.s, z3.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x4482907d // srshl z29.s, p4/M, z29.s, z3.s\n"
+ ".inst 0x44829072 // srshl z18.s, p4/M, z18.s, z3.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x4482907c // srshl z28.s, p4/M, z28.s, z3.s\n"
+ ".inst 0x44829070 // srshl z16.s, p4/M, z16.s, z3.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x44829075 // srshl z21.s, p4/M, z21.s, z3.s\n"
+ ".inst 0x44829074 // srshl z20.s, p4/M, z20.s, z3.s\n"
+ ".inst 0x4482907b // srshl z27.s, p4/M, z27.s, z3.s\n"
+ ".inst 0x4482907a // srshl z26.s, p4/M, z26.s, z3.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z19.s, z19.s, z0.s\n"
+ ".inst 0x44829079 // srshl z25.s, p4/M, z25.s, z3.s\n"
+ ".inst 0x44829078 // srshl z24.s, p4/M, z24.s, z3.s\n"
+ "add z30.s, z30.s, z0.s\n"
+ "add z17.s, z17.s, z0.s\n"
+ ".inst 0x44829077 // srshl z23.s, p4/M, z23.s, z3.s\n"
+ ".inst 0x44829076 // srshl z22.s, p4/M, z22.s, z3.s\n"
+ "add z29.s, z29.s, z0.s\n"
+ "add z18.s, z18.s, z0.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "smax z31.s, p4/M, z31.s, z2.s\n"
+ "smax z19.s, p4/M, z19.s, z2.s\n"
+ "add z21.s, z21.s, z0.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "smax z30.s, p4/M, z30.s, z2.s\n"
+ "smax z17.s, p4/M, z17.s, z2.s\n"
+ "add z27.s, z27.s, z0.s\n"
+ "add z26.s, z26.s, z0.s\n"
+ "smax z29.s, p4/M, z29.s, z2.s\n"
+ "smax z18.s, p4/M, z18.s, z2.s\n"
+ "add z25.s, z25.s, z0.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "smax z28.s, p4/M, z28.s, z2.s\n"
+ "smax z16.s, p4/M, z16.s, z2.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z22.s, z22.s, z0.s\n"
+ "smax z21.s, p4/M, z21.s, z2.s\n"
+ "smax z20.s, p4/M, z20.s, z2.s\n"
+ "smax z27.s, p4/M, z27.s, z2.s\n"
+ "smax z26.s, p4/M, z26.s, z2.s\n"
+ "smax z25.s, p4/M, z25.s, z2.s\n"
+ "smax z24.s, p4/M, z24.s, z2.s\n"
+ "smax z23.s, p4/M, z23.s, z2.s\n"
+ "smax z22.s, p4/M, z22.s, z2.s\n"
+ "smin z31.s, p4/M, z31.s, z1.s\n"
+ "smin z19.s, p4/M, z19.s, z1.s\n"
+ "smin z30.s, p4/M, z30.s, z1.s\n"
+ "smin z17.s, p4/M, z17.s, z1.s\n"
+ "smin z29.s, p4/M, z29.s, z1.s\n"
+ "smin z18.s, p4/M, z18.s, z1.s\n"
+ "smin z28.s, p4/M, z28.s, z1.s\n"
+ "smin z16.s, p4/M, z16.s, z1.s\n"
+ "trn1 z19.h, z31.h, z19.h\n"
+ "smin z21.s, p4/M, z21.s, z1.s\n"
+ "smin z20.s, p4/M, z20.s, z1.s\n"
+ "trn1 z17.h, z30.h, z17.h\n"
+ "smin z27.s, p4/M, z27.s, z1.s\n"
+ "smin z26.s, p4/M, z26.s, z1.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z25.s, p4/M, z25.s, z1.s\n"
+ "smin z24.s, p4/M, z24.s, z1.s\n"
+ "trn1 z16.h, z28.h, z16.h\n"
+ "smin z23.s, p4/M, z23.s, z1.s\n"
+ "smin z22.s, p4/M, z22.s, z1.s\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "trn1 z20.b, z19.b, z17.b\n"
+ "trn1 z17.h, z27.h, z26.h\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z18.b, z18.b, z16.b\n"
+ "trn1 z16.h, z23.h, z22.h\n"
+ "st1b { z20.b }, p3, [%x[outptr], x9]\n"
"incb x9, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z19.b, z16.b\n"
+ "st1b { z18.b }, p2, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p1, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p0, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p0.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
"lsr x25, %x[n_valid_cells], #0x2\n"
- "mov z8.b, #0x0\n"
+ "mov z6.b, #0x0\n"
"mov x24, %x[inptrs]\n"
"cbz x25, 11f\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
"subs x25, x25, #0x1\n"
"add x24, x24, #0x20\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "movprfx z16, z2\n umax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n umax z17.b, p4/M, z17.b, z0.b\n"
"ldp x23, x22, [x24, #0x0]\n"
"ldp x21, x20, [x24, #0x10]\n"
- "umax z16.b, p0/M, z16.b, z17.b\n"
"subs x25, x25, #0x1\n"
- "ld1b { z4.b }, p4/Z, [x23, x9]\n"
- "ld1b { z3.b }, p4/Z, [x22, x9]\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
"add x24, x24, #0x20\n"
- "ld1b { z2.b }, p4/Z, [x21, x9]\n"
- "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "umax z16.b, p4/M, z16.b, z17.b\n"
+ "ld1b { z2.b }, p3/Z, [x23, x9]\n"
+ "ld1b { z1.b }, p3/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p3/Z, [x21, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x20, x9]\n"
+ "umax z6.b, p4/M, z6.b, z16.b\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
- "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
- "umax z16.b, p0/M, z16.b, z17.b\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
+ "movprfx z16, z2\n umax z16.b, p4/M, z16.b, z1.b\n"
+ "movprfx z17, z23\n umax z17.b, p4/M, z17.b, z0.b\n"
+ "umax z16.b, p4/M, z16.b, z17.b\n"
+ "umax z6.b, p4/M, z6.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
"ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
"ldr x20, [x24], #0x8\n"
- "ld1b { z16.b }, p4/Z, [x20, x9]\n"
"subs x21, x21, #0x1\n"
- "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x9]\n"
+ "umax z6.b, p4/M, z6.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z18.s }, p0/Z, [x20]\n"
- ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
- ".inst 0x4508ad10 // ushllt z16.h, z8.b, #0x0\n"
- "neg z18.s, p0/M, z18.s\n"
+ "add x21, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ ".inst 0x4508a8d1 // ushllb z17.h, z6.b, #0x0\n"
+ ".inst 0x4508acda // ushllt z26.h, z6.b, #0x0\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x45914255 // saddwb z21.s, z18.s, z17.h\n"
- ".inst 0x45914654 // saddwt z20.s, z18.s, z17.h\n"
- ".inst 0x45904253 // saddwb z19.s, z18.s, z16.h\n"
- ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
- ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
- ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
- ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
- ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
- ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
- ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p4/Z, [x21]\n"
+ "ld1rw { z25.s }, p4/Z, [x20]\n"
+ "add x21, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
"add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "ld1rw { z17.s }, p0/Z, [x20]\n"
- ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "ld1rw { z24.s }, p4/Z, [x21]\n"
+ "ld1rw { z23.s }, p4/Z, [x20]\n"
"add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
- ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
- "ld1rw { z16.s }, p0/Z, [x20]\n"
- "add z21.s, z21.s, z16.s\n"
- ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
- ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
- "add z20.s, z20.s, z16.s\n"
- "add z19.s, z19.s, z16.s\n"
- "add z18.s, z18.s, z16.s\n"
- "mov z16.s, #0x0\n"
- "smax z21.s, p0/M, z21.s, z16.s\n"
- "smax z20.s, p0/M, z20.s, z16.s\n"
- "smax z19.s, p0/M, z19.s, z16.s\n"
- "smax z18.s, p0/M, z18.s, z16.s\n"
- "mov z16.s, #0xff\n"
- "smin z21.s, p0/M, z21.s, z16.s\n"
- "smin z20.s, p0/M, z20.s, z16.s\n"
- "trn1 z17.h, z21.h, z20.h\n"
- "smin z19.s, p0/M, z19.s, z16.s\n"
- "smin z18.s, p0/M, z18.s, z16.s\n"
- "trn1 z16.h, z19.h, z18.h\n"
+ "mov z22.s, #0x0\n"
+ "ld1rw { z21.s }, p4/Z, [x20]\n"
+ "mov z20.s, #0xff\n"
+ "neg z16.s, p4/M, z16.s\n"
+ ".inst 0x45914213 // saddwb z19.s, z16.s, z17.h\n"
+ ".inst 0x45914611 // saddwt z17.s, z16.s, z17.h\n"
+ ".inst 0x459a4212 // saddwb z18.s, z16.s, z26.h\n"
+ ".inst 0x459a4610 // saddwt z16.s, z16.s, z26.h\n"
+ ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n"
+ ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n"
+ ".inst 0x44829332 // srshl z18.s, p4/M, z18.s, z25.s\n"
+ ".inst 0x44829330 // srshl z16.s, p4/M, z16.s, z25.s\n"
+ ".inst 0x04b87673 // sqrdmulh z19.s, z19.s, z24.s\n"
+ ".inst 0x04b87631 // sqrdmulh z17.s, z17.s, z24.s\n"
+ ".inst 0x04b87652 // sqrdmulh z18.s, z18.s, z24.s\n"
+ ".inst 0x04b87610 // sqrdmulh z16.s, z16.s, z24.s\n"
+ ".inst 0x448292f3 // srshl z19.s, p4/M, z19.s, z23.s\n"
+ ".inst 0x448292f1 // srshl z17.s, p4/M, z17.s, z23.s\n"
+ ".inst 0x448292f2 // srshl z18.s, p4/M, z18.s, z23.s\n"
+ ".inst 0x448292f0 // srshl z16.s, p4/M, z16.s, z23.s\n"
+ "add z19.s, z19.s, z21.s\n"
+ "add z17.s, z17.s, z21.s\n"
+ "add z18.s, z18.s, z21.s\n"
+ "add z16.s, z16.s, z21.s\n"
+ "smax z19.s, p4/M, z19.s, z22.s\n"
+ "smax z17.s, p4/M, z17.s, z22.s\n"
+ "smax z18.s, p4/M, z18.s, z22.s\n"
+ "smax z16.s, p4/M, z16.s, z22.s\n"
+ "smin z19.s, p4/M, z19.s, z20.s\n"
+ "smin z17.s, p4/M, z17.s, z20.s\n"
+ "smin z18.s, p4/M, z18.s, z20.s\n"
+ "smin z16.s, p4/M, z16.s, z20.s\n"
+ "trn1 z17.h, z19.h, z17.h\n"
+ "trn1 z16.h, z18.h, z16.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "st1b { z16.b }, p3, [%x[outptr], x9]\n"
"incb x9\n"
- "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
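// Editor's note: the max kernel follows the same per-lane requantisation path, except each
// pooled uint8 maximum is first widened and has the (negated) input offset folded in via the
// saddwb/saddwt pairs, and the shift and multiplier come from the per-layer Requantize32
// parameters. A rough scalar sketch, reusing the helpers from the averaging sketch above
// (names illustrative only):

// One output lane of the max kernel's requantisation (scalar reference only).
static uint8_t requantize_max_lane(uint8_t pooled_max, int32_t input_offset,
                                   int32_t per_layer_left_shift, int32_t per_layer_mul,
                                   int32_t per_layer_right_shift, int32_t output_offset)
{
  int32_t v = static_cast<int32_t>(pooled_max) - input_offset;  // neg + ushllb/ushllt + saddwb/saddwt
  v = rounding_shift(v, per_layer_left_shift);                  // srshl by per_layer_left_shift
  v = sat_rounding_doubling_mul_high(v, per_layer_mul);         // sqrdmulh by per_layer_mul
  v = rounding_shift(v, per_layer_right_shift);                 // srshl by per_layer_right_shift
  v += output_offset;                                           // add qp.output_offset
  return static_cast<uint8_t>(std::min(std::max(v, 0), 255));   // smax #0x0 / smin #0xff
}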
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
index 1ca478513c..dbd1f9516d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,17 +91,17 @@ class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
protected:
/* Compute the amount of working space required for a single thread. */
- size_t get_working_size_per_thread() const override
+ size_t get_working_size_per_thread(unsigned int n_channels) const override
{
- return sizeof(WorkingSpace) + this->m_args.n_channels * (sizeof(TInput) + sizeof(TOutput));
+ return sizeof(WorkingSpace) + n_channels * (sizeof(TInput) + sizeof(TOutput));
}
/* Initialise the working space for a thread. */
- void initialise_working_space(void *raw_ws) const override
+ void initialise_working_space(void *raw_ws, unsigned int n_channels) const override
{
auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
ws->input_buffer = ws + 1;
- ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * this->m_args.n_channels;
+ ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * n_channels;
// Fill the input buffer with an appropriate value
TInput fill_val = 0;
@@ -119,7 +119,6 @@ class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
}
auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
- auto n_channels = this->m_args.n_channels;
for (; n_channels; n_channels--)
{
*(ptr++) = fill_val;
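// Editor's note: with n_channels now passed in explicitly, the per-thread buffer keeps the
// same layout as before: the WorkingSpace header, then n_channels input elements, then
// n_channels output elements, which is exactly what get_working_size_per_thread() reserves.
// An illustrative layout sketch (pointer names refer to the members set above):
//
//   | WorkingSpace header | TInput input_buffer[n_channels] | TOutput output_buffer[n_channels] |
//   ^ raw_ws              ^ ws + 1                          ^ (char *)(ws + 1) + sizeof(TInput) * n_channels
//
//   total = sizeof(WorkingSpace) + n_channels * (sizeof(TInput) + sizeof(TOutput))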
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
index ded2c75127..cb241cf76f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -136,8 +136,8 @@ class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
const OutputStage m_os;
protected:
- size_t get_working_size_per_thread() const override { return 0; }
- void initialise_working_space(void *) const override { /* Nothing */ }
+ size_t get_working_size_per_thread(unsigned int) const override { return 0; }
+ void initialise_working_space(void *, unsigned int) const override { /* Nothing */ }
/* Compute a portion of the output tensor with padding. */
void compute_tile_padded(